PageRenderTime 33ms CodeModel.GetById 9ms RepoModel.GetById 0ms app.codeStats 0ms

/munge/munge.R

https://bitbucket.org/trevorld/a3
R | 75 lines | 68 code | 6 blank | 1 comment | 1 complexity | a60656e0044615a9ec541a2631985a44 MD5 | raw file
  1. library("plyr")
  2. library("reshape2")
  3. library("zipcode")
  # Make sure the output directory for the generated CSV files exists;
  # warnings (e.g. the directory already being there) are suppressed.
  dir.create(path = "cache", showWarnings = FALSE)
  # Read one year of county-level unemployment data from
  # data/laucnty<year>.txt.gz and return it as a data frame.
  #
  # Args:
  #   year: suffix used to build the file name (e.g. "92", "07", "11").
  #
  # Returns:
  #   A data frame with columns laus.code, state.code, county.code, county,
  #   year, labor.force, employed, unemployed and rate. The first line that
  #   survives filtering is consumed by read.delim() as the header row.
  get_unemployment_county_data_by_year <- function(year) {
    ur_lines <- readLines(file.path("data", sprintf("laucnty%s.txt.gz", year)))

    # Drop blank lines and title/header lines. Logical negation is used on
    # purpose: x[-which(cond)] returns an EMPTY vector when nothing matches,
    # which would silently throw away the whole file.
    boilerplate <- c(
      "^$",
      "Labor Force Data",
      "^[[:blank:]]+State",
      "^[[:blank:]]+LAUS"
    )
    for (pattern in boilerplate) {
      ur_lines <- ur_lines[!grepl(pattern, ur_lines)]
    }

    if (any(grepl("SOURCE", ur_lines))) { # 2007 and 2008 have SOURCE at bottom
      ur_lines <- ur_lines[!grepl("SOURCE", ur_lines)]
      # Footer date line such as "April 20, 2009".
      ur_lines <- ur_lines[
        !grepl("[[:alpha:]]+ [[:digit:]]+, [[:digit:]]{4}", ur_lines)
      ]
    }

    # Remove thousands separators inside numbers ("25,930" -> "25930").
    ur_lines <- gsub("([[:digit:]]+),([[:digit:]])", "\\1\\2", ur_lines)
    ur_lines <- gsub("^ +", "", ur_lines)
    # NOTE(review): as written, every run of spaces becomes a tab, including
    # single spaces inside county names -- confirm the pattern against the
    # original script (whitespace may have been collapsed in this copy).
    ur_lines <- gsub(" +", "\t", ur_lines)
    ur_lines <- gsub("^,", "", ur_lines)
    # Literal match: as a regex the dots in "N.A." would match any character.
    ur_lines <- gsub("N.A.", "NA", ur_lines, fixed = TRUE)
    # Quote text that follows a comma on every line except the first.
    ur_lines[-1] <- gsub(
      ",([[:alpha:]]+.*[[:alpha:]]+)", ',"\\1"', ur_lines[-1]
    )

    # read.delim(text = ) opens and closes its own connection, so no
    # textConnection is left open.
    ur_county_data <- read.delim(text = ur_lines, sep = "\t")
    names(ur_county_data) <- c(
      "laus.code", "state.code", "county.code", "county", "year",
      "labor.force", "employed", "unemployed", "rate"
    )
    ur_county_data
  }
  # Normalise county labels so they can be matched against the County.name
  # values of the zip/county table.
  #
  # Args:
  #   strings: character vector of county labels such as "Autauga County, AL".
  #
  # Returns:
  #   A character vector of the same length with the substitutions below
  #   applied in order.
  strip_counties <- function(strings) {
    # Ordered (pattern, replacement) pairs. Order matters: the
    # "Borough/..." rules must run before the generic "/city" rule.
    substitutions <- list(
      c(" County", ""),
      c(" Municipio", ""),
      c("Borough/municipality", "Municipality"),
      c("Borough/city", "City and Borough"),
      c("/city", ""),
      c("/town", ""),
      c("Prince of Wales-Outer Ketchikan Census Area, AK",
        "Prince of Wales-Hyder Census Area, AK"),
      c("Skagway-Hoonah-Angoon Census Area, AK", "Skagway Municipality, AK"),
      c("Wrangell-Petersburg Census Area, AK",
        "Wrangell City and Borough, AK"),
      c("District of Columbia", "District of Columbia, DC"),
      # \u00f1 (n with tilde) instead of the raw Latin-1 byte \xf1, which is
      # an invalid string in UTF-8 locales.
      c("Dona Ana, NM", "Do\u00f1a Ana, NM")
    )
    for (pair in substitutions) {
      # Every pattern is a plain literal, so match it as fixed text.
      strings <- gsub(pair[1], pair[2], strings, fixed = TRUE)
    }
    strings
  }
  # Two-digit year suffixes "92".."99", "00".."11" used in the input file names.
  years <- sprintf("%02d", c(92:99, 0:11))

  # Read every year and stack the results; rbind.fill pads columns that are
  # missing from some of the data frames.
  ur_county_data_list <- lapply(years, FUN = get_unemployment_county_data_by_year)
  ur_county_data <- plyr::rbind.fill(ur_county_data_list)

  # Get rid of Puerto Rico observations. A logical mask is used because
  # df[-grep(...), ] drops EVERY row when grep() finds no match.
  ur_county_data <- ur_county_data[!grepl("PR$", ur_county_data$county), ]

  # Normalised county label plus the trailing two-letter state code.
  ur_county_data$County.name <- strip_counties(ur_county_data$county)
  ur_county_data$state <- gsub(
    ".*([[:alpha:]]{2}$)", "\\1", ur_county_data$County.name
  )
  # Long format: one row per (year, state, variable) observation for the two
  # counts needed to recompute unemployment rates.
  ur_county_melted <- reshape2::melt(
    ur_county_data,
    id.vars = c("year", "state"),
    measure.vars = c("labor.force", "unemployed")
  )

  # National rate per year: sum the county counts, then take the ratio.
  ur_usa_data <- reshape2::dcast(
    ur_county_melted, year ~ variable,
    fun.aggregate = sum, na.rm = TRUE
  )
  ur_usa_data$rate <- 100 * ur_usa_data$unemployed / ur_usa_data$labor.force
  ur_usa_data <- ur_usa_data[, c("year", "rate")]
  write.csv(ur_usa_data, "cache/us_unemployment_data.csv", row.names = FALSE)

  # State rate per year, computed the same way within each state.
  ur_state_data <- reshape2::dcast(
    ur_county_melted, year + state ~ variable,
    fun.aggregate = sum, na.rm = TRUE
  )
  ur_state_data$rate <- 100 * ur_state_data$unemployed / ur_state_data$labor.force
  ur_state_data <- ur_state_data[, c("year", "state", "rate")]
  write.csv(ur_state_data, "cache/state_unemployment_data.csv", row.names = FALSE)
  # Build a table with one mean latitude/longitude per County.name.
  # Rows with any missing value in the county file are dropped first.
  zip_county_data <- na.omit(read.csv("data/geocorr2010_county.csv.gz"))
  zip_county_data$zip <- zipcode::clean.zipcodes(zip_county_data$ZCTA)

  # Load the zipcode dataset and join it on the columns the two tables share
  # (presumably just `zip` -- verify against the dataset's column names).
  data(zipcode)
  zip_county_data <- merge(zip_county_data, zipcode)
  zip_county_data <- zip_county_data[, c("County.name", "latitude", "longitude")]

  # Average the coordinates of all matched zip codes within each county.
  zip_county_melted <- reshape2::melt(zip_county_data, id.vars = "County.name")
  zip_county_data <- reshape2::dcast(
    zip_county_melted, County.name ~ variable,
    fun.aggregate = mean
  )
  # Attach the county coordinates (joined on the columns the two tables have
  # in common) and keep only the columns that are written out.
  ur_county_data <- merge(ur_county_data, zip_county_data)
  ur_county_data <- ur_county_data[
    , c("county", "year", "rate", "latitude", "longitude")
  ]

  # Give the District of Columbia label a state suffix.
  ur_county_data[["county"]] <- gsub(
    "District of Columbia", "District of Columbia, DC",
    ur_county_data[["county"]]
  )

  # Sort by county, then year, and write the result; NA is written as an
  # empty field.
  ur_county_data <- plyr::arrange(ur_county_data, county, year)
  write.csv(
    ur_county_data,
    "cache/county_unemployment_data.csv",
    row.names = FALSE,
    na = ""
  )