# tidy.R - Tidy data, Ilya Kashnitsky, ilya.kashnitsky@gmail.com

# /day1/tidy.R

# https://github.com/ikashnitsky/dataviz-mpidr · R · 96 lines · 31 code · 38 blank · 27 comment · 0 complexity · e654eb28083bdd2cff5c4c01b3c1bf85 MD5 · raw file

#===============================================================================
# 2021-06-14 -- MPIDR dataviz
# Tidy data
# Ilya Kashnitsky, ilya.kashnitsky@gmail.com
#===============================================================================

# load the packages
library(tidyverse)
library(magrittr)


# Read the data with readxl -----------------------------------------------

library(readxl)

# list the worksheets contained in the Excel workbook
excel_sheets("data/data-denmark.xlsx")

# import the "deaths" and "pop" worksheets as two separate data frames
deaths <- read_excel("data/data-denmark.xlsx", sheet = "deaths")
pop <- read_excel("data/data-denmark.xlsx", sheet = "pop")


# Reshaping data with tidyr -----------------------------------------------

# long -> wide: each distinct `year` becomes its own column, filled from `value`
pop_w <- pivot_wider(pop, names_from = year, values_from = value)

# the same call written with the piping operator ( %>% ):
# the left-hand side is passed on as the first argument of the function
pop_w <- pop %>%
  pivot_wider(
    names_from = year,
    values_from = value
  )

# wide -> long: collect the columns whose names contain "200" back into a
# `year` column (holding the former column names) and a `value` column
pop_l <- pop_w %>%
  pivot_longer(
    cols = contains("200"),
    names_to = "year"
  )

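# note: pivot_wider() and pivot_longer() are the newer tidyr reshaping
# functions; they replace the older spread() and gather()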


# Basic dplyr functions ---------------------------------------------------

# filter: keep the rows for year 2003, leaving out those where sex is "b".
# `sex != "b"` is used instead of `!sex == "b"`: both give the same result,
# but the latter only works because `!` binds more loosely than `==`.
pop_filt <- pop %>%
  filter(year == "2003", sex != "b")

# magrittr !!!

# select: keep only the columns whose names contain "a"
# (contains() ignores case by default)
pop_select <- pop %>%
  select(contains("a"))


# bind dfs: stack the rows of `deaths` underneath the rows of `pop`
df_bind <- pop %>%
  bind_rows(deaths)


# join: attach the matching `pop` rows to every row of `deaths`;
# same-named columns that are not join keys receive the default
# ".x" (from deaths) and ".y" (from pop) suffixes
df_joined <- deaths %>%
  left_join(pop, by = c("year", "region", "sex", "age"))


# rename: give the suffixed columns produced by the join meaningful names
df_re <- rename(df_joined, deaths = value.x, pop = value.y)


# mutate: add a new column `mx`, the ratio of deaths to population
df <- mutate(df_re, mx = deaths / pop)

# transmute: compute `mx` straight from the join output and keep only the
# listed columns -- a shortcut for rename + mutate + select
df_tr <- transmute(df_joined, region, sex, mx = value.x / value.y)



# group_by() -> summarize() -> ungroup():
# average `value` within every region / sex / age combination,
# then remove the grouping that summarize() leaves behind
df_sum <- pop %>%
  group_by(region, sex, age) %>%
  summarize(
    mean = mean(value)
  ) %>%
  ungroup()


# summarise(across(where(is.numeric), ...)) -- the current replacement for
# the superseded summarise_if(is.numeric, ...)
#
# 1. spread the years into columns with pivot_wider() (replaces the superseded
#    spread(); names_sort = TRUE keeps the sorted column order spread() gave)
# 2. group by sex and age
# 3. average every remaining numeric column within each group;
#    non-numeric columns are dropped from the result
# 4. .groups = "drop" returns an ungrouped data frame, matching the
#    group -> summarise -> ungroup pattern used for `df_sum`
df_sum_if <- pop %>%
  pivot_wider(names_from = year, values_from = value, names_sort = TRUE) %>%
  group_by(sex, age) %>%
  summarise(across(where(is.numeric), mean), .groups = "drop")


# build the data frame to be used in the ggplot show: keep only the
# year / region / sex / age combinations present in both tables, name the
# two value columns, and compute mx
# (this replaces the `df` created in the mutate example)
df <- deaths %>%
  inner_join(pop, by = c("year", "region", "sex", "age")) %>%
  rename(deaths = value.x, pop = value.y) %>%
  mutate(mx = deaths / pop)



# saving data in Rdata (rda) format ---------------------------------------

# write the object `df` to disk
save(list = "df", file = "data/Denmark.Rdata")

# load the result again (the object is restored under its saved name, `df`)
load(file = "data/Denmark.Rdata")