munge.R | searchcode

/munge/munge.R

https://bitbucket.org/trevorld/a3
R | 75 lines | 68 code | 6 blank | 1 comment | 1 complexity | a60656e0044615a9ec541a2631985a44 MD5 | raw file

library("plyr")
library("reshape2")
library("zipcode")
# Ensure the "cache" output directory exists; showWarnings = FALSE keeps
# reruns quiet when the directory is already there.
dir.create(path = "cache", showWarnings = FALSE)
# Read one year of LAUS county labor-force data from
# data/laucnty<year>.txt.gz and return it as a data frame.
#
# Args:
#   year: value interpolated into the file name with "%s" (the script passes
#         two-digit strings such as "92" or "07").
#
# Returns:
#   A data frame whose columns are renamed to laus.code, state.code,
#   county.code, county, year, labor.force, employed, unemployed and rate.
get_unemployment_county_data_by_year <- function(year) {
    ur_lines <- readLines(file.path("data", sprintf("laucnty%s.txt.gz", year)))

    # Remove lines with a logical mask. The `x[-which(grepl(...))]` form
    # returns an empty vector when nothing matches, which would silently
    # discard the whole file if any one of these patterns were absent.
    drop_lines <- function(lines, pattern) lines[!grepl(pattern, lines)]

    # Blank lines, the title line, and the header rows that start with
    # "State" or "LAUS" after leading blanks.
    ur_lines <- drop_lines(ur_lines, "^$")
    ur_lines <- drop_lines(ur_lines, "Labor Force Data")
    ur_lines <- drop_lines(ur_lines, "^[[:blank:]]+State")
    ur_lines <- drop_lines(ur_lines, "^[[:blank:]]+LAUS")
    if (any(grepl("SOURCE", ur_lines))) { # 2007 and 2008 have SOURCE at bottom
        ur_lines <- drop_lines(ur_lines, "SOURCE")
        # Only when a SOURCE footer exists: also drop lines containing a
        # "<word> <digits>, <4 digits>" date.
        ur_lines <- drop_lines(ur_lines,
                               "[[:alpha:]]+ [[:digit:]]+, [[:digit:]]{4}")
    }

    # Strip thousands separators inside numbers, e.g. "16,383" -> "16383".
    ur_lines <- gsub("([[:digit:]]+),([[:digit:]])", "\\1\\2", ur_lines)
    # Convert to tab-separated fields: remove a leading run of 2+ spaces, then
    # replace every remaining run of 2+ spaces with a single tab.
    ur_lines <- gsub("^  +", "", ur_lines)
    ur_lines <- gsub("  +", "\t", ur_lines)
    ur_lines <- gsub("^,", "", ur_lines)
    # Literal match: as a regular expression the dots in "N.A." would match
    # any character.
    ur_lines <- gsub("N.A.", "NA", ur_lines, fixed = TRUE)
    # On every line but the first, quote alphabetic text that directly
    # follows a comma.
    ur_lines[-1] <- gsub(",([[:alpha:]]+.*[[:alpha:]]+)", ',"\\1"', ur_lines[-1])

    # read.delim() does not close a connection that is already open, so close
    # the text connection explicitly instead of leaking it.
    con <- textConnection(ur_lines)
    on.exit(close(con), add = TRUE)
    # The first remaining line is consumed as the header (read.delim default);
    # its names are replaced immediately below.
    ur_county_data <- read.delim(con, sep = "\t")
    names(ur_county_data) <- c("laus.code", "state.code", "county.code",
            "county", "year", "labor.force", "employed", "unemployed", "rate")
    ur_county_data
}
# Rewrite county labels by applying a fixed, ordered list of regular-expression
# substitutions to every element of `strings`:
#   * remove the text " County" and " Municipio", and the "/city" and "/town"
#     qualifiers;
#   * reword "Borough/municipality" and "Borough/city";
#   * rename three Alaska census areas;
#   * append ", DC" to "District of Columbia";
#   * respell "Dona Ana, NM" with the single byte 0xF1 in place of the "n".
# NOTE(review): presumably this aligns the labels with the County.name column
# of the geocorr file merged later in the script - confirm.
#
# Args:
#   strings: character vector of county labels.
#
# Returns:
#   The rewritten character vector.
strip_counties <- function(strings) {
    # Each entry is c(pattern, replacement). Order matters: e.g. the
    # "Borough/city" rewrite must run before the bare "/city" removal.
    substitutions <- list(
        c(" County", ""),
        c(" Municipio", ""),
        c("Borough/municipality", "Municipality"),
        c("Borough/city", "City and Borough"),
        c("/city", ""),
        c("/town", ""),
        c("Prince of Wales-Outer Ketchikan Census Area, AK",
          "Prince of Wales-Hyder Census Area, AK"),
        c("Skagway-Hoonah-Angoon Census Area, AK", "Skagway Municipality, AK"),
        c("Wrangell-Petersburg Census Area, AK", "Wrangell City and Borough, AK"),
        c("District of Columbia", "District of Columbia, DC"),
        c("Dona Ana, NM", "Do\xf1a Ana, NM")
    )
    for (rule in substitutions) {
        strings <- gsub(rule[1], rule[2], strings)
    }
    strings
}


# Two-digit year suffixes of the input files: "92".."99", then "00".."11".
years <- sprintf("%02d", c(92:99, 0:11))
# One data frame per year, stacked into a single frame; rbind.fill (plyr)
# fills any column missing from a given year with NA.
ur_county_data_list <- lapply(years, FUN = get_unemployment_county_data_by_year)
ur_county_data <- do.call("rbind.fill", ur_county_data_list)
# Get rid of Puerto Rico observations (county label ending in "PR").
# A logical mask is used because `df[-grep(...), ]` drops EVERY row when no
# label matches.
ur_county_data <- ur_county_data[!grepl("PR$", ur_county_data$county), ]
# Normalised county label, and the two-letter state code taken from its end.
ur_county_data$County.name <- strip_counties(ur_county_data$county)
ur_county_data$state <- gsub(".*([[:alpha:]]{2}$)", "\\1", ur_county_data$County.name)

# Long-format view of the two count columns, keyed by year and state.
ur_county_melted <- reshape2::melt(
    ur_county_data,
    id.vars = c("year", "state"),
    measure.vars = c("labor.force", "unemployed")
)

# National series: sum the county counts per year (ignoring NAs), then derive
# the unemployment rate in percent from the summed counts.
ur_usa_data <- reshape2::dcast(ur_county_melted, year ~ variable, sum, na.rm = TRUE)
ur_usa_data$rate <- with(ur_usa_data, 100 * unemployed / labor.force)
ur_usa_data <- ur_usa_data[, c("year", "rate")]
write.csv(ur_usa_data, "cache/us_unemployment_data.csv", row.names = FALSE)

# State series: sum the county counts per year and state (ignoring NAs), then
# derive the unemployment rate in percent from the summed counts.
ur_state_data <- reshape2::dcast(
    ur_county_melted,
    year + state ~ variable,
    sum,
    na.rm = TRUE
)
ur_state_data$rate <- with(ur_state_data, 100 * unemployed / labor.force)
ur_state_data <- ur_state_data[, c("year", "state", "rate")]
write.csv(ur_state_data, "cache/state_unemployment_data.csv", row.names = FALSE)

# Build one latitude/longitude pair per County.name:
#   1. read the geocorr county file and drop rows containing any NA;
#   2. derive a cleaned `zip` column from the ZCTA column;
#   3. join with the `zipcode` dataset on their shared column names;
#   4. average latitude and longitude over all rows of each County.name.
# NOTE(review): presumably this approximates a county centroid from its ZIP
# code locations - confirm.
zip_county_data <- na.omit(read.csv("data/geocorr2010_county.csv.gz"))
zip_county_data$zip <- zipcode::clean.zipcodes(zip_county_data$ZCTA)
data(zipcode)
zip_county_data <- merge(zip_county_data, zipcode)[
    , c("County.name", "latitude", "longitude")
]
zip_county_melted <- reshape2::melt(zip_county_data, id.vars = "County.name")
zip_county_data <- reshape2::dcast(
    zip_county_melted,
    County.name ~ variable,
    mean
)

# Attach the per-county coordinates (joined on the shared column names) and
# keep only the columns that are written out.
ur_county_data <- merge(ur_county_data, zip_county_data)[
    , c("county", "year", "rate", "latitude", "longitude")
]
# The `county` label for DC carries no state suffix; add one.
ur_county_data[["county"]] <- gsub(
    "District of Columbia",
    "District of Columbia, DC",
    ur_county_data[["county"]]
)
# Sort by county, then year, and write the result with NAs as empty fields.
ur_county_data <- plyr::arrange(ur_county_data, county, year)
write.csv(
    ur_county_data,
    "cache/county_unemployment_data.csv",
    row.names = FALSE,
    na = ""
)