/additional_preprocessing_code/census_pipeline/1_download_census_data.R

https://github.com/wxwx1993/PM_COVID · R · 146 lines · 116 code · 23 blank · 7 comment · 30 complexity · ab818a42d72ca50c13d1ffe5195c32a0 MD5 · raw file

  1. library(tidycensus)
  2. library(tigris)
  3. library(dplyr)
  4. library(yaml)
  5. library(tidyr)
  ## Look up the download plan of one variable for a given data year.
  ##
  ## plan: named list keyed by variable name; each entry is a list whose names
  ##       are year keys.
  ## var:  name of the variable to look up in `plan`.
  ## year: data year being processed.
  ##
  ## Returns the entry stored under the first year key (in the order the keys
  ## are listed in `plan[[var]]`) that is >= `year`. Stops with an error when
  ## `var` has no year keys or no key covers `year`, instead of silently
  ## returning NULL.
  get_variables <- function(plan, var, year) {
    year_keys <- names(plan[[var]])
    ## The keys are character, so R coerces `year` and compares as strings;
    ## this orders correctly as long as the keys are four-digit years.
    data_key <- year_keys[year <= year_keys][1]
    if (length(data_key) == 0 || is.na(data_key)) {
      stop("No plan entry for variable '", var, "' covering year ", year,
           call. = FALSE)
    }
    plan[[var]][[data_key]]
  }
  ## Plan describing, per output variable and year, which census source
  ## variables make up its numerator ("num") and optional denominator ("den").
  variable_path <- "census_vars.yml"
  variable_plan <- yaml.load_file(variable_path)

  ## ACS 5 year data available from 2009 forward, decennial data in 2000, 2010
  data_years <- c(2000, 2009:2018)

  ## Row-wise sum of the columns of `data` named in `source_vars`.
  ## An empty/NULL `source_vars` yields a vector of zeros. Stops with a clear
  ## message when a requested column was not downloaded.
  sum_source_columns <- function(data, source_vars) {
    total <- rep(0, nrow(data))
    for (source_var in source_vars) {
      column <- data[[source_var]]
      if (is.null(column)) {
        stop("Census variable '", source_var,
             "' is missing from the downloaded data", call. = FALSE)
      }
      total <- total + column
    }
    total
  }

  ## One slot per year; combined once after the loop instead of growing a
  ## data frame with rbind() on every iteration.
  yearly_results <- vector("list", length(data_years))

  for (i in seq_along(data_years)) {
    year <- data_years[i]
    year_plan <- list()
    acs_vars <- NULL
    dec_vars <- NULL

    ## Figure out what to download and from where
    for (var in names(variable_plan)) {
      spec <- get_variables(variable_plan, var, year)
      if (is.null(spec)) {
        stop("No plan entry for variable '", var, "' in year ", year,
             call. = FALSE)
      }
      year_plan[[var]] <- spec
      ## identical() is used because `spec` is a nested list unless it is the
      ## literal string "skip".
      if (identical(spec, "skip")) {
        next
      }
      source_name <- names(spec)[1]
      if (identical(source_name, "acs")) {
        acs_vars <- union(acs_vars,
                          union(spec[["acs"]][["num"]],
                                spec[["acs"]][["den"]]))
      } else if (identical(source_name, "census")) {
        dec_vars <- union(dec_vars,
                          union(spec[["census"]][["num"]],
                                spec[["census"]][["den"]]))
      } else {
        stop("Unknown data source for variable '", var, "' in year ", year,
             call. = FALSE)
      }
    }

    ## Every variable is skipped this year: nothing to download or compute.
    ## The county/year frame built later represents such years as missing.
    if (is.null(acs_vars) && is.null(dec_vars)) {
      message("No variables planned for ", year, "; skipping")
      next
    }

    ## Make API calls
    if (!is.null(acs_vars)) {
      acs_data <- get_acs(geography = "county", variables = acs_vars,
                          year = year)
      acs_data$moe <- NULL
      acs_data <- pivot_wider(acs_data,
                              id_cols = c(GEOID, NAME),
                              names_from = variable,
                              values_from = estimate)
    }
    if (!is.null(dec_vars)) {
      if (year == 2000) {
        ## Need to split in to sf1 and sf3
        sf1_varlist <- load_variables(year = 2000, dataset = "sf1",
                                      cache = TRUE)
        ## Request order matches the original script: sf1 variables, then
        ## P004002 on its own from sf1, then everything not listed in sf1
        ## from sf3.
        ## NOTE(review): P004002 is requested separately from the other sf1
        ## variables; the reason is not visible in this file - confirm.
        requests <- list(
          list(vars = intersect(setdiff(dec_vars, "P004002"),
                                sf1_varlist$name),
               sumfile = "sf1"),
          list(vars = intersect(dec_vars, "P004002"),
               sumfile = "sf1"),
          list(vars = setdiff(dec_vars, sf1_varlist$name),
               sumfile = "sf3")
        )
        dec_data <- NULL
        for (request in requests) {
          ## Skip empty requests rather than calling the API with no variables.
          if (length(request$vars) > 0) {
            dec_data <- rbind(dec_data,
                              get_decennial(geography = "county",
                                            variables = request$vars,
                                            sumfile = request$sumfile,
                                            year = year))
          }
        }
      } else {
        dec_data <- get_decennial(geography = "county", variables = dec_vars,
                                  year = year)
      }
      dec_data <- pivot_wider(dec_data,
                              id_cols = c(GEOID, NAME),
                              names_from = variable,
                              values_from = value)
    }

    ## Merge/unify variable names
    if (!is.null(acs_vars) && !is.null(dec_vars)) {
      data <- inner_join(acs_data, dec_data, suffix = c("", ".y"),
                         by = c("GEOID"))
      data$NAME.y <- NULL
      rm(acs_data, dec_data)
    } else if (!is.null(acs_vars)) {
      data <- acs_data
      rm(acs_data)
    } else {
      data <- dec_data
      rm(dec_data)
    }

    ## Use variable plan to calculate values
    for (var in names(year_plan)) {
      spec <- year_plan[[var]]
      if (identical(spec, "skip")) {
        data[[var]] <- NA
      } else {
        ## spec[[1]] is the single source entry ("acs" or "census").
        numerator <- sum_source_columns(data, spec[[1]][["num"]])
        denominator_vars <- spec[[1]][["den"]]
        if (!is.null(denominator_vars)) {
          data[[var]] <- numerator / sum_source_columns(data, denominator_vars)
        } else {
          data[[var]] <- numerator
        }
      }
    }

    data <- select(data, c("GEOID", "NAME", names(year_plan)))
    data$year <- year
    yearly_results[[i]] <- data
  }

  ## Stack all years; stays NULL if no year produced any data.
  out <- do.call(rbind, Filter(Negate(is.null), yearly_results))
  ## Create framework to allow missingness
  ## Build one row per county per year (1999-2018) and left-join the
  ## downloaded values onto it, so county/years without data show up as NA
  ## rows rather than being absent.
  county_data <- counties()

  ## NOTE(review): ALAND is taken to be in square meters (per Census
  ## TIGER/Line documentation), so this divisor converts land area to square
  ## miles - confirm.
  sq_meters_per_sq_mile <- 2589988

  county_data <- as_tibble(data.frame(
    fips = paste0(county_data$STATEFP, county_data$COUNTYFP),
    NAME = county_data$NAMELSAD,
    land_area = as.numeric(county_data$ALAND) / sq_meters_per_sq_mile,
    stringsAsFactors = FALSE
  ))

  ## Stack one copy of the county table per year, in year order.
  merged_data <- do.call(rbind, lapply(1999:2018, function(year) {
    county_data$year <- year
    county_data
  }))

  out <- left_join(merged_data, out,
                   by = c("fips" = "GEOID", "year"),
                   suffix = c("", ".y"))
  ## Keep the county name from the TIGER table; drop the census API's copy.
  out$NAME.y <- NULL

  if ("population" %in% names(variable_plan)) {
    out$population_density <- out$population / out$land_area
  }

  write.csv(out, "../data/census_data/census_county_uninterpolated.csv",
            row.names = FALSE)