/day1/tidy.R

https://github.com/ikashnitsky/dataviz-mpidr · R · 96 lines · 31 code · 38 blank · 27 comment · 0 complexity · e654eb28083bdd2cff5c4c01b3c1bf85 MD5 · raw file

  1. #===============================================================================
  2. # 2021-06-14 -- MPIDR dataviz
  3. # Tidy data
  4. # Ilya Kashnitsky, ilya.kashnitsky@gmail.com
  5. #===============================================================================
  6. # load the package
  7. library(tidyverse)
  8. library(magrittr)
  9. # Read the data with readxl -----------------------------------------------
  10. library(readxl)
  # List the worksheets contained in the workbook
  readxl::excel_sheets(path = "data/data-denmark.xlsx")
  # Import the "deaths" and "pop" worksheets, each into its own data frame
  deaths <- readxl::read_excel("data/data-denmark.xlsx", "deaths")
  pop <- readxl::read_excel("data/data-denmark.xlsx", "pop")
  # Reshaping data with tidyr -----------------------------------------------
  # long -> wide: every distinct `year` becomes its own column, filled
  # with the matching entries of `value`
  pop_w <- pivot_wider(
    data = pop,
    names_from = year,
    values_from = value
  )
  # the very same reshaping, now written with the piping operator ( %>% )
  pop_w <- pop %>% pivot_wider(names_from = year, values_from = value)
  # wide -> long: collect all columns whose name contains "200" and store
  # the former column names in `year`
  pop_l <- pivot_longer(
    data = pop_w,
    cols = contains("200"),
    names_to = "year"
  )
  # note: the pivot_* functions are the new reshaping interface !!!
  # Basic dplyr functions ---------------------------------------------------
  # filter: keep the rows of year "2003" whose sex is anything but "b"
  pop_filt <- filter(pop, year == "2003", sex != "b")
  # magrittr !!!
  # select: keep only the columns whose name contains "a"
  pop_select <- select(pop, contains("a"))
  # bind dfs: stack the rows of `deaths` underneath the rows of `pop`
  df_bind <- pop %>% bind_rows(deaths)
  # join: match every row of `deaths` with `pop` on the four key columns;
  # the two `value` columns come back as value.x (deaths) and value.y (pop)
  df_joined <- deaths %>%
    left_join(pop, by = c("year", "region", "sex", "age"))
  # rename: give the two value columns meaningful names
  df_re <- rename(df_joined, deaths = value.x, pop = value.y)
  # mutate: add mx, the ratio of deaths to pop
  df <- mutate(df_re, mx = deaths / pop)
  # transmute: rename + mutate + select in a single step -- only the listed
  # columns (region, sex, mx) are kept
  df_tr <- transmute(df_joined, region, sex, mx = value.x / value.y)
  # group -> summarise -> ungroup, written as nested calls (read inside-out):
  # average `value` within each region/sex/age cell, then drop the grouping
  df_sum <- ungroup(
    summarise(
      group_by(pop, region, sex, age),
      mean = mean(value)
    )
  )
  # Mean of every numeric column within each sex/age group.
  # Written with pivot_wider() + across(where(is.numeric), ...), which replace
  # the superseded spread() + summarise_if(is.numeric, ...) pair.
  #   * names_sort = TRUE keeps the year columns in sorted order, as spread()
  #     did, instead of pivot_wider()'s default order of first appearance
  #   * non-numeric, non-grouping columns are dropped from the result
  #   * .groups = "drop_last" keeps the result grouped by `sex`, exactly as
  #     summarise_if() left it; append ungroup() if the grouping is not needed
  df_sum_if <- pop %>%
    pivot_wider(
      names_from = year,
      values_from = value,
      names_sort = TRUE
    ) %>%
    group_by(sex, age) %>%
    summarise(across(where(is.numeric), mean), .groups = "drop_last")
  # Build the data frame that is saved for the ggplot session: keep only the
  # year/region/sex/age combinations present in both tables, name the two
  # value columns, and add mx as the ratio of deaths to pop
  df <- deaths %>%
    inner_join(pop, by = c("year", "region", "sex", "age")) %>%
    rename(deaths = value.x, pop = value.y) %>%
    mutate(mx = deaths / pop)
  # Saving data in Rdata (rda) format ----------------------------------------
  # write the object `df` to disk
  save(list = "df", file = "data/Denmark.Rdata")
  # read the saved object back into the workspace
  load(file = "data/Denmark.Rdata")