/day1/tidy-exercises-solutions.R

https://github.com/ikashnitsky/dataviz-mpidr · R · 107 lines · 38 code · 30 blank · 39 comment · 6 complexity · 08a4cffeef95f691c1913d862129e9c4 MD5 · raw file

  1. #===============================================================================
  2. # 2021-06-14 -- MPIDR dataviz
  3. # Tidy exercises
  4. # Ilya Kashnitsky, ilya.kashnitsky@gmail.com
  5. #===============================================================================
  6. # load the package
  7. library(tidyverse)
  8. # Read the data with readxl -----------------------------------------------
  9. library(readxl)
  10. # see the names of the sheets
  11. readxl::excel_sheets('data/data-denmark.xlsx')
  12. deaths <- read_excel(path = 'data/data-denmark.xlsx', sheet = 'deaths')
  13. pop <- read_excel(path = 'data/data-denmark.xlsx', sheet = 'pop')
  14. # Ex 1. deaths dataframe --------------------------------------------------
  15. # - subset only total number of deaths among men in year 2003 (filter)
  16. # Q: which region had the largest number of deaths?
  17. deaths %>%
  18. filter(year == "2003", sex == "m", age == "total") %>%
  19. arrange(value %>% desc) # sort in descending order by value
  20. # A: Region DK01 (which is Copenhagen area, no wonder here)
  21. # Ex 2. pop dataframe -----------------------------------------------------
  22. # - subset only the year 2004
  23. # - transform to wide format using the column "sex" (pivot_wider)
  24. # - get rid of the column for both sex
  25. # - calculate the sex ratio (males to females)
  26. # Q: in which region the SR is highest at ages 15, 45, over75 (coded as "open")
  27. pop %>%
  28. filter(year == "2004") %>%
  29. pivot_wider(names_from = sex, values_from = value) %>%
  30. select(-b) %>%
  31. mutate(sr = m / f) %>%
  32. filter(age %in% c(15, 45, "open")) %>%
  33. group_by(age) %>%
  34. arrange(sr %>% desc) %>%
  35. slice(1) # with this line we take only the first observation from each group
  36. # A: DK01 at age 15; DK03 at age 45; DK05 at age 75+
  37. # Ex 3. joined dataframe --------------------------------------------------
  38. # - join the two dataframes (left_join OR inner_join)
  39. # - calculate age specific death ratios
  40. # - subset only the ages 15-59 and year 2001
  41. # Q: what is the average ratio of male ASDR to female ASDR in each region?
  42. # Tip: use summarize
  43. df <- pop %>%
  44. inner_join(deaths, by = c("year", "region", "sex", "age")) %>%
  45. rename(pop = value.x, deaths = value.y) %>%
  46. mutate(mx = deaths / pop)
  47. df %>%
  48. filter(age %in% 15:59, year == "2001") %>%
  49. select(-year, -pop, -deaths) %>%
  50. pivot_wider(names_from = sex, values_from = mx) %>%
  51. select(-b) %>%
  52. mutate(sr_mx = (m / f) %>% na_if(Inf)) %>% # replace Inf with NAs
  53. group_by(region) %>%
  54. summarise(avg_mx_sr = sr_mx %>% mean(na.rm = TRUE)) %>%
  55. ungroup()
  56. # A:
  57. # region avg_mx_sr
  58. # <chr> <dbl>
  59. # 1 DK01 1.77
  60. # 2 DK02 2.22
  61. # 3 DK03 2.34
  62. # 4 DK04 2.06
  63. # 5 DK05 1.93
  64. # Ex 4. joined dataframe (df) ---------------------------------------------
  65. # - subset only both sex
  66. # - transform to wide format using the column "year" (pivot_wider)
  67. # - calculate the growth of ASDR between 2005 and 2001
  68. # Q: in which region the average growth/decrease in ASDR was largest?
  69. df %>%
  70. filter(sex == "b") %>%
  71. select(-pop, -deaths) %>%
  72. pivot_wider(names_from = year, values_from = mx) %>%
  73. mutate(mx_growth = (`2005`/`2001`) %>% na_if(Inf)) %>% # note the `` -- bare numbers are not allowed for R objects
  74. group_by(region) %>%
  75. summarise(avg_mx_growth = mx_growth %>% mean(na.rm = TRUE)) %>%
  76. arrange(avg_mx_growth %>% desc)
  77. # A: largest growth DK01 -- 1.03; largest decrease DK03 -- 0.9