/munge/munge.R
R | 75 lines | 68 code | 6 blank | 1 comment | 1 complexity | a60656e0044615a9ec541a2631985a44 MD5 | raw file
# Packages used below: plyr (rbind.fill, arrange), reshape2 (melt, dcast)
# and zipcode (clean.zipcodes plus the zipcode data set).
library("plyr")
library("reshape2")
library("zipcode")

# All CSV outputs are written under cache/; an already existing directory
# is not reported as a warning.
dir.create("cache", showWarnings = FALSE)
# Parse one year of the whitespace-aligned county labor-force report stored
# at data/laucnty<year>.txt.gz into a data frame.
#
# Args:
#   year: value spliced into the file name with sprintf("%s"), e.g. "92".
#
# Returns:
#   A data frame with columns laus.code, state.code, county.code, county,
#   year, labor.force, employed, unemployed and rate. The first line that
#   survives filtering is consumed by read.delim() as the header row; its
#   names are then overwritten with the ones above.
get_unemployment_county_data_by_year <- function(year) {
  # Filter with a logical mask. The x[-which(hit)] form returns an EMPTY
  # vector when nothing matches, which would silently discard the whole file
  # whenever one kind of noise line happens to be absent.
  drop_matching <- function(lines, pattern) {
    lines[!grepl(pattern, lines)]
  }

  ur_path <- file.path("data", sprintf("laucnty%s.txt.gz", year))
  ur_file <- readLines(ur_path)

  # Blank lines, the report title and the column-heading lines.
  noise_patterns <- c(
    "^$",
    "Labor Force Data",
    "^[[:blank:]]+State",
    "^[[:blank:]]+LAUS"
  )
  for (noise in noise_patterns) {
    ur_file <- drop_matching(ur_file, noise)
  }

  if (any(grepl("SOURCE", ur_file))) { # 2007 and 2008 have SOURCE at bottom
    ur_file <- drop_matching(ur_file, "SOURCE")
    # Line shaped like "<word> <digits>, <4 digits>" -- presumably a date
    # printed next to the SOURCE footer.
    ur_file <- drop_matching(
      ur_file,
      "[[:alpha:]]+ [[:digit:]]+, [[:digit:]]{4}"
    )
  }

  # Strip thousands separators so counts parse as numbers: "16,039" -> "16039".
  ur_file <- gsub("([[:digit:]]+),([[:digit:]])", "\\1\\2", ur_file)
  ur_file <- gsub("^ +", "", ur_file)
  # Column gaps become tabs. Only runs of two or more spaces count as a gap:
  # splitting on every single space would break multi-word county names such
  # as "Autauga County, AL" across several columns.
  # NOTE(review): assumes columns in the raw files are separated by at least
  # two spaces -- confirm against the data.
  ur_file <- gsub(" {2,}", "\t", ur_file)
  ur_file <- gsub("^,", "", ur_file)
  # Literal match: as a regex the dots in "N.A." would match any character.
  ur_file <- gsub("N.A.", "NA", ur_file, fixed = TRUE)
  # On every line but the first, wrap text that directly follows a comma in
  # double quotes.
  ur_file[-1] <- gsub(",([[:alpha:]]+.*[[:alpha:]]+)", ',"\\1"', ur_file[-1])

  # read.delim() does not close a connection that was opened by the caller,
  # so release it explicitly when the function exits.
  con <- textConnection(ur_file)
  on.exit(close(con), add = TRUE)
  ur_county_data <- read.delim(con, sep = "\t")

  names(ur_county_data) <- c(
    "laus.code", "state.code", "county.code",
    "county", "year", "labor.force", "employed", "unemployed", "rate"
  )
  ur_county_data
}
# Normalise county labels: remove the " County" / " Municipio" suffixes,
# rewrite the "Borough/..." , "/city" and "/town" variants, rename three
# Alaska census areas, append ", DC" to "District of Columbia" and respell
# "Dona Ana, NM" with an n-tilde.
#
# Args:
#   strings: character vector (or anything gsub() coerces to character).
#
# Returns:
#   Character vector of the same length with the substitutions applied.
strip_counties <- function(strings) {
  # Ordered (pattern, replacement) pairs. Order matters: the specific
  # "Borough/..." rules must run before the generic "/city" rule.
  rules <- rbind(
    c(" County", ""),
    c(" Municipio", ""),
    c("Borough/municipality", "Municipality"),
    c("Borough/city", "City and Borough"),
    c("/city", ""),
    c("/town", ""),
    c("Prince of Wales-Outer Ketchikan Census Area, AK",
      "Prince of Wales-Hyder Census Area, AK"),
    c("Skagway-Hoonah-Angoon Census Area, AK", "Skagway Municipality, AK"),
    c("Wrangell-Petersburg Census Area, AK", "Wrangell City and Borough, AK"),
    c("District of Columbia", "District of Columbia, DC")
  )
  for (i in seq_len(nrow(rules))) {
    strings <- gsub(rules[i, 1], rules[i, 2], strings)
  }

  # The replacement holds a raw Latin-1 byte (0xF1, n-tilde). gsub() checks
  # the replacement for validity in multibyte (e.g. UTF-8) locales and would
  # stop with an error there; matching byte-wise skips that check and yields
  # the same bytes in every locale.
  # NOTE(review): the Latin-1 byte presumably mirrors the encoding of the
  # county names this result is later matched against -- confirm.
  strings <- gsub(
    "Dona Ana, NM", "Do\xf1a Ana, NM", strings,
    useBytes = TRUE
  )
  strings
}
# Two-digit year suffixes used in the input file names:
# "92".."99" followed by "00".."11".
years <- sprintf("%02d", c(92:99, 0:11))

# One data frame per year, stacked into a single frame.
ur_county_data_list <- lapply(years, FUN = get_unemployment_county_data_by_year)
ur_county_data <- plyr::rbind.fill(ur_county_data_list)

# Get rid of Puerto Rico observations. A logical mask is used on purpose:
# df[-grep(...), ] would drop EVERY row if no county ended in "PR".
is_puerto_rico <- grepl("PR$", ur_county_data$county)
ur_county_data <- ur_county_data[!is_puerto_rico, ]

ur_county_data$County.name <- strip_counties(ur_county_data$county)

# The state is the trailing two-letter code of the cleaned name. Matching is
# byte-wise because strip_counties() can insert a raw Latin-1 byte
# ("Do\xf1a Ana"), which character-wise matching may reject as an invalid
# string in a UTF-8 locale.
ur_county_data$state <- gsub(
  ".*([[:alpha:]]{2}$)", "\\1", ur_county_data$County.name,
  useBytes = TRUE
)
# Long format: one row per (year, state, variable) for the two count columns
# needed to recompute a rate.
ur_county_melted <- reshape2::melt(
  ur_county_data,
  id.vars = c("year", "state"),
  measure.vars = c("labor.force", "unemployed")
)

# National series: sum the counts per year, then derive the rate in percent.
ur_usa_data <- reshape2::dcast(
  ur_county_melted, year ~ variable,
  fun.aggregate = sum, na.rm = TRUE
)
ur_usa_data$rate <- 100 * ur_usa_data$unemployed / ur_usa_data$labor.force
ur_usa_data <- ur_usa_data[, c("year", "rate")]
write.csv(ur_usa_data, "cache/us_unemployment_data.csv", row.names = FALSE)

# State series: same computation, summed per year and state.
ur_state_data <- reshape2::dcast(
  ur_county_melted, year + state ~ variable,
  fun.aggregate = sum, na.rm = TRUE
)
ur_state_data$rate <-
  100 * ur_state_data$unemployed / ur_state_data$labor.force
ur_state_data <- ur_state_data[, c("year", "state", "rate")]
write.csv(ur_state_data, "cache/state_unemployment_data.csv", row.names = FALSE)
# County/ZCTA table with incomplete rows removed; the ZCTA column is turned
# into a cleaned zip code so it can be joined to the zipcode data set.
zip_county_data <- na.omit(read.csv("data/geocorr2010_county.csv.gz"))
zip_county_data$zip <- zipcode::clean.zipcodes(zip_county_data$ZCTA)

# Join on whatever columns the two tables share, then keep the county name
# and the coordinates.
# NOTE(review): the join key is presumably just `zip` -- confirm that the
# geocorr file has no other column names in common with `zipcode`.
data(zipcode)
zip_county_data <- merge(zip_county_data, zipcode)
zip_county_data <- zip_county_data[, c("County.name", "latitude", "longitude")]

# One coordinate pair per county: the mean latitude/longitude over its rows.
zip_county_melted <- reshape2::melt(zip_county_data, id.vars = "County.name")
zip_county_data <- reshape2::dcast(
  zip_county_melted, County.name ~ variable,
  fun.aggregate = mean
)

# Attach the coordinates to the unemployment rows; County.name is the only
# column the two frames have in common.
ur_county_data <- merge(ur_county_data, zip_county_data, by = "County.name")
ur_county_data <- ur_county_data[
  , c("county", "year", "rate", "latitude", "longitude")
]
ur_county_data$county <- gsub(
  "District of Columbia", "District of Columbia, DC",
  ur_county_data$county
)
ur_county_data <- plyr::arrange(ur_county_data, county, year)
write.csv(
  ur_county_data, "cache/county_unemployment_data.csv",
  row.names = FALSE, na = ""
)