/inst/scripts/Format_signatures.R

https://github.com/UMCUGenetics/MutationalPatterns · R · 191 lines · 87 code · 22 blank · 82 comment · 0 complexity · 403c69dfa78265ce3243a98014517e43 MD5 · raw file

  1. #Script to convert a signature file from SIGNAL into the correct format for MutationalPatterns
  2. library(dplyr)
  3. library(stringr)
  4. library(readr)
  5. library(magrittr)
  # Convert a SIGNAL signature file into the MutationalPatterns layout.
  #
  # fname: path to a tab-separated SIGNAL file with a header row and "," as
  #   the decimal mark. The first column is treated as the mutation context.
  #
  # The context column is split into `Type` (the text between the square
  # brackets) and `SubType` (the context with the brackets and the ">X" part
  # removed); both are placed in front of the remaining columns. The table is
  # written tab-separated to inst/extdata/signatures/<basename(fname)>,
  # relative to the current working directory.
  #
  # Returns 0 invisibly.
  format_SIGNAL_signatures <- function(fname) {
    sig_table <- read.table(
      fname,
      header = TRUE,
      sep = "\t",
      dec = ",",
      stringsAsFactors = FALSE
    )

    # Whatever the first column is called in the download, treat it as the
    # combined context column.
    names(sig_table)[1] <- "Type_subtype"

    sig_table <- dplyr::mutate(
      sig_table,
      Type = str_replace(Type_subtype, ".*\\[(.*)\\].*", "\\1"),
      SubType = str_remove_all(Type_subtype, "\\[|\\]|\\>[A-Z]")
    )
    sig_table <- dplyr::select(sig_table, -Type_subtype)
    sig_table <- dplyr::relocate(sig_table, Type, SubType)

    write.table(
      sig_table,
      file = file.path("inst", "extdata", "signatures", basename(fname)),
      sep = "\t",
      quote = FALSE,
      row.names = FALSE
    )
    invisible(0)
  }
  # Reformat the three SIGNAL SNV signature downloads, in this order.
  invisible(lapply(
    c(
      "~/Downloads/snv_SIGNAL_tissue.txt",
      "~/Downloads/snv_SIGNAL_reference.txt",
      "~/Downloads/snv_SIGNAL_exposure.txt"
    ),
    format_SIGNAL_signatures
  ))
  # DBS data was not on the signature website, but has been extracted from the paper:
  # "A Compendium of Mutational Signatures of Environmental Agents"
  32. #Add sparse signatures from the paper:
  33. # "De Novo Mutational Signature Discovery in Tumor Genomes using SparseSignatures"
  # Read the SparseSignatures table ("," decimal mark; `sig` is the only
  # character column), transpose it so that the values of `sig` become column
  # names, and derive the Type / SubType columns from the mutation context.
  signatures <- read_tsv(
    "~/Downloads/snv_SPARSE.txt",
    locale = locale(decimal_mark = ","),
    col_types = cols(.default = "d", sig = "c")
  ) %>%
    # Long-then-wide is used here as a transpose of the table.
    tidyr::pivot_longer(-sig, names_to = "Type_subtype", values_to = "values") %>%
    tidyr::pivot_wider(names_from = sig, values_from = values) %>%
    dplyr::mutate(
      Type = str_replace(Type_subtype, ".*\\[(.*)\\].*", "\\1"),
      SubType = str_remove_all(Type_subtype, "\\[|\\]|\\>[A-Z]")
    ) %>%
    dplyr::select(-Type_subtype) %>%
    dplyr::relocate(Type, SubType)

  # Write the reformatted table next to the other reference signatures
  # (path is relative to the current working directory).
  write.table(
    signatures,
    file = "inst/extdata/signatures/snv_SPARSE_reference.txt",
    quote = FALSE,
    row.names = FALSE,
    sep = "\t"
  )
  49. #Format COSMIC signatures
  50. # COSMIC Version 3.1
  51. # format_COSMIC_signatures = function(in_fname, extra_sigs, out_fname, muttype){
  52. #
  53. # #Read main signature file
  54. # signatures = read.table(in_fname,
  55. # sep = ",",
  56. # stringsAsFactors = FALSE,
  57. # header = TRUE)
  58. #
  59. # if (!.is_na(extra_sigs)){
  60. # #Read separate signature files
  61. # sig_fnames = paste0("~/Downloads/sigProfiler_",
  62. # muttype,
  63. # "_signatures_",
  64. # extra_sigs,
  65. # ".csv")
  66. # sigs_to_add_m = purrr::map(sig_fnames, ~read.table(.x,
  67. # sep = ",",
  68. # stringsAsFactors = FALSE,
  69. # header = TRUE)) %>%
  70. # purrr::map(function(x) x[, ncol(x), drop = FALSE]) %>%
  71. # do.call(cbind, .)
  72. #
  73. # #Fix column names
  74. # colnames(sigs_to_add_m) = str_remove(colnames(sigs_to_add_m), "_GRCh37")
  75. #
  76. # #Combine in one single data.frame.
  77. # signatures = cbind(signatures, sigs_to_add_m)
  78. # }
  79. # #Write out
  80. # out_path = file.path("inst", "extdata", "signatures", out_fname)
  81. # write.table(signatures,
  82. # out_path,
  83. # sep = "\t",
  84. # row.names = FALSE,
  85. # quote = FALSE)
  86. # invisible(0)
  87. # }
  88. #
  89. # format_COSMIC_signatures("~/Downloads/sigProfiler_ID_signatures.csv",
  90. # paste0("ID", c(18)),
  91. # "indel_COSMIC_v3.1_reference.txt",
  92. # "ID")
  93. #
  94. # format_COSMIC_signatures("~/Downloads/sigProfiler_DBS_signatures.csv",
  95. # NA,
  96. # "dbs_COSMIC_v3.1_reference.txt",
  97. # NA)
  98. #
  99. # format_COSMIC_signatures("~/Downloads/sigProfiler_TSB_signatures.csv",
  100. # NA,
  101. # "tsb_snv_COSMIC_v3.1_reference.txt")
  102. #
  103. #
  104. # # Format Cosmic signatures for the SNVs
  105. # mut_mat <- readRDS(system.file("states/mut_mat_data.rds",
  106. # package = "MutationalPatterns"
  107. # ))
  108. #
  109. # # Read in Cosmic signatures 3.1
  110. # sbs_sigs = read.table("~/Downloads/cosmic_v3.1.txt", dec = ",", header = T) %>%
  111. # dplyr::mutate(cont = paste0(str_sub(Subtype, 1, 1), "[", Type, "]", str_sub(Subtype, 3))) %>%
  112. # dplyr::mutate(cont = factor(cont, levels = rownames(mut_mat))) %>%
  113. # dplyr::arrange(cont) %>%
  114. # dplyr::select(-cont)
  115. #
  116. #
  117. # sbs_sigs = as.matrix(sbs_sigs[,-c(1,2)])
  118. # write.table(sbs_sigs,
  119. # "~/surfdrive/Shared/Boxtel_General/Scripts/Git_submission/Freek_MutationalPatterns/MutationalPatterns/inst/extdata/signatures/snv_COSMIC_v3.1_reference.txt",
  120. # quote = F, row.names = F, sep = "\t")
  # Copy a COSMIC v3.2 signature table into the package's signature directory.
  #
  # in_fname:  path to a tab-separated signature file with a header row.
  # out_fname: file name to create under inst/extdata/signatures (relative to
  #   the current working directory).
  #
  # The table is read and written back tab-separated, without row names and
  # without quoting. Returns 0 invisibly.
  format_COSMIC_signaturesv3_2 <- function(in_fname, out_fname) {
    cosmic_table <- read.table(
      in_fname,
      header = TRUE,
      sep = "\t",
      stringsAsFactors = FALSE
    )

    destination <- file.path("inst", "extdata", "signatures", out_fname)
    write.table(
      cosmic_table,
      file = destination,
      quote = FALSE,
      row.names = FALSE,
      sep = "\t"
    )
    invisible(0)
  }
  # Copy the COSMIC v3.2 indel and DBS tables into the package; the i-th input
  # file is written under the i-th output name.
  invisible(Map(
    format_COSMIC_signaturesv3_2,
    c(
      "~/Downloads/COSMIC_v3.2_ID_GRCh37.txt",
      "~/Downloads/COSMIC_v3.2_DBS_GRCh37.txt",
      "~/Downloads/COSMIC_v3.2_DBS_GRCh38.txt",
      "~/Downloads/COSMIC_v3.2_DBS_mm10.txt"
    ),
    c(
      "indel_COSMIC_v3.2_reference_GRCh37.txt",
      "dbs_COSMIC_v3.2_reference_GRCh37.txt",
      "dbs_COSMIC_v3.2_reference_GRCh38.txt",
      "dbs_COSMIC_v3.2_reference_mm10.txt"
    )
  ))
  # Format Cosmic signatures for the SNVs
  # Load `mut_mat_data.rds` from the installed MutationalPatterns package; the
  # row names of this object are used further down to order the SNV contexts.
  # mustWork = TRUE makes a missing package or file fail right here with a
  # clear error, instead of system.file() returning "" and readRDS() failing
  # on an empty path.
  mut_mat <- readRDS(system.file("states/mut_mat_data.rds",
    package = "MutationalPatterns",
    mustWork = TRUE
  ))
  148. # Read in Cosmic signatures 3.2
  # Reformat a COSMIC SBS signature table and write it as a reference file.
  #
  # in_fname: path to the COSMIC SBS file (read with read.table, header row
  #   expected). Its `Type` column is expected to hold the full context in
  #   bracket notation, e.g. "A[C>A]A".
  # genome:   genome label, used in the output file name.
  # source:   signature source label, used in the output file name.
  # mut_mat:  object whose rownames give the desired order of the contexts.
  # out_dir:  directory the result is written to. The default is the location
  #   that used to be hard-coded in this function and only exists on the
  #   original author's machine; pass e.g.
  #   file.path("inst", "extdata", "signatures") when running elsewhere.
  #
  # The `Type` column is split into `Type` (text between the brackets) and
  # `Subtype` (context without brackets and without the ">X" part), rows are
  # sorted to follow rownames(mut_mat), and the table is written tab-separated
  # to <out_dir>/snv_<source>_reference_<genome>.txt.
  #
  # Returns the (invisible) result of write.table().
  format_COSMIC_snv_signatures = function(in_fname, genome, source, mut_mat,
                                          out_dir = "~/surfdrive/Shared/Boxtel_General/Scripts/Git_submission/Freek_MutationalPatterns/MutationalPatterns/inst/extdata/signatures") {
    # NOTE(review): dec = "," is kept from the original call; confirm that the
    # COSMIC downloads really use a decimal comma.
    sbs_sigs = read.table(in_fname, dec = ",", header = TRUE)

    # Contexts that are not row names of mut_mat turn into NA factor levels and
    # end up at the bottom of the table; make that visible instead of silent.
    unmatched = setdiff(sbs_sigs[["Type"]], rownames(mut_mat))
    if (length(unmatched) > 0) {
      warning("Contexts in '", in_fname, "' not found in rownames(mut_mat): ",
              paste(unmatched, collapse = ", "),
              call. = FALSE)
    }

    sbs_sigs = sbs_sigs %>%
      dplyr::mutate(cont = Type,
                    Type = str_replace(cont, ".*\\[(.*)\\].*", "\\1"),
                    Subtype = str_remove(str_remove(cont, ">.*\\]"), "\\[")) %>%
      # Order the rows like the rows of mut_mat.
      dplyr::mutate(cont = factor(cont, levels = rownames(mut_mat))) %>%
      dplyr::arrange(cont) %>%
      dplyr::select(-cont) %>%
      dplyr::relocate(Subtype, .after = Type)

    #sbs_sigs = as.matrix(sbs_sigs[,-c(1,2)])
    out_path = file.path(out_dir,
                         paste0("snv_", source, "_reference_", genome, ".txt"))
    write.table(sbs_sigs,
                out_path,
                quote = FALSE, row.names = FALSE, sep = "\t")
  }
  # Build the COSMIC v3.2 SNV reference file for each genome build; the i-th
  # input file belongs to the i-th genome label.
  invisible(Map(
    function(sbs_file, build) {
      format_COSMIC_snv_signatures(sbs_file, build, "COSMIC_v3.2", mut_mat)
    },
    c(
      "~/Downloads/COSMIC_v3.2_SBS_GRCh37.txt",
      "~/Downloads/COSMIC_v3.2_SBS_GRCh38.txt",
      "~/Downloads/COSMIC_v3.2_SBS_mm10.txt"
    ),
    c("GRCh37", "GRCh38", "mm10")
  ))