/data-raw/rowData.R

https://github.com/waldronlab/curatedMetagenomicData · R · 61 lines · 46 code · 15 blank · 0 comment · 0 complexity · 4819bbab2d64b5d6bf3979697a69035b MD5 · raw file

  # Download the marker info file (a `.bz2` archive hosted on Zenodo) to a
  # temporary path, then read all of its lines eagerly into a character vector.
  marker_info_file <- "https://zenodo.org/record/3955744/files/mpa_v30_CHOCOPhlAn_201901_marker_info.txt.bz2"
  marker_info_path <- base::tempfile()
  utils::download.file(marker_info_file, marker_info_path)
  # `lazy = FALSE` forces the whole file to be read now; the progress bar is
  # switched off.
  marker_info_data <- readr::read_lines(
    file = marker_info_path,
    progress = FALSE,
    lazy = FALSE
  )
  # Pull two fields out of every marker info line: the first run of digits
  # (used below as the NCBI id) and the text that starts at "k__" and runs up
  # to, but not including, the last single quote on the line.
  ncbi_id <- stringr::str_extract(marker_info_data, "[0-9]+")
  rowname <- stringr::str_extract(marker_info_data, "k__.+(?=')")
  # Keep only lines where a "k__" string was found and it does not contain
  # "t__"; lines with no match (NA) are dropped as well.
  to_keep <- !base::is.na(rowname) & !stringr::str_detect(rowname, "t__")
  # Pair the retained ids with their row names and remove duplicate pairs.
  marker_info_data <- dplyr::distinct(
    base::data.frame(
      ncbi_id = ncbi_id[to_keep],
      rowname = rowname[to_keep]
    )
  )
  # Re-extract the de-duplicated columns as plain vectors.
  ncbi_id <- marker_info_data[["ncbi_id"]]
  rowname <- marker_info_data[["rowname"]]
  # Ranks that are retained from each classification result.
  rank_names <- base::c(
    "superkingdom", "phylum", "class", "order", "family", "genus", "species"
  )
  # Query NCBI for the classification of every id, then reshape each result:
  # the `rank` column becomes `name`, the former `name` column becomes `value`,
  # only `id`, `name` and `value` are kept, and rows whose rank is not listed
  # in `rank_names` are discarded.
  taxized <- purrr::map(
    taxize::classification(ncbi_id, db = "ncbi"),
    \(lineage) {
      lineage |>
        dplyr::rename(name = rank, value = name) |>
        dplyr::select(id, name, value) |>
        dplyr::filter(name %in% rank_names)
    }
  )
  # Wide table built from the `value` column of each `taxized` element: each
  # element is pivoted so ranks become columns, the results are row-bound, the
  # columns are put in `rank_names` order and coerced to character, and the
  # table is stored as an S4Vectors DataFrame with row names from `rowname`.
  rowDataLong <-
    purrr::map(taxized, \(x) dplyr::select(x, name, value)) |>
    # `names_from = "name"` spells out the default that pivot_wider() would
    # otherwise apply implicitly.
    purrr::map_dfr(
      \(x) tidyr::pivot_wider(x, names_from = "name", values_from = "value")
    ) |>
    dplyr::select(tidyselect::all_of(rank_names)) |>
    # `across()` is namespace-qualified like every other call in this script,
    # and `.cols` is supplied explicitly because calling `across()` without it
    # is deprecated since dplyr 1.1.0.
    dplyr::mutate(
      dplyr::across(
        .cols = tidyselect::everything(),
        .fns = base::as.character
      )
    ) |>
    S4Vectors::DataFrame() |>
    magrittr::set_rownames(rowname)
  # Wide table built from the `id` column of each `taxized` element: each
  # element is pivoted so ranks become columns, the results are row-bound, the
  # columns are put in `rank_names` order and coerced to integer, and the
  # table is stored as an S4Vectors DataFrame with row names from `rowname`.
  rowDataNCBI <-
    purrr::map(taxized, \(x) dplyr::select(x, name, id)) |>
    # `names_from = "name"` spells out the default that pivot_wider() would
    # otherwise apply implicitly.
    purrr::map_dfr(
      \(x) tidyr::pivot_wider(x, names_from = "name", values_from = "id")
    ) |>
    dplyr::select(tidyselect::all_of(rank_names)) |>
    # `across()` is namespace-qualified like every other call in this script,
    # and `.cols` is supplied explicitly because calling `across()` without it
    # is deprecated since dplyr 1.1.0.
    dplyr::mutate(
      dplyr::across(
        .cols = tidyselect::everything(),
        .fns = base::as.integer
      )
    ) |>
    S4Vectors::DataFrame() |>
    magrittr::set_rownames(rowname)