scrape_citydata.R - Data scrap of city-data.com for local g…

/scrape_citydata.R

https://github.com/eeg31/police-reform · R · 132 lines · 90 code · 28 blank · 14 comment · 2 complexity · c22eaf244e36c4a6bb3b272bb13ebcfb MD5 · raw file

library(tidyverse)
library(rvest)
library(janitor)

# Data scrap of city-data.com for local govt budgets
# @imaginary_nums 2020

# Input: url to state page
# Value: Tibble of city urls
get_city_urls <- function(url) {
  city_nodes <- url %>% 
    read_html %>% 
    html_nodes(xpath = '//*[@id="cityTAB"]') 
  
  urls <- city_nodes %>% 
    html_nodes("a") %>% 
    html_attr("href") %>% 
    as_tibble() %>% 
    rename(URL = value) %>% 
    mutate(URL = paste0("http://www.city-data.com/city/", URL))
  
  cities <- city_nodes %>%
    first %>%
    html_table %>%
    as_tibble %>%
    select(Name, Population) %>%
    cbind(urls) %>% 
    mutate(Population = as.integer(str_remove_all(Population, ","))) %>% 
    filter(URL %>% str_detect("html")) %>% 
    unique()
    
  return(cities)
}

# Input: url to city page
# Value: Tibble of city payroll data
get_city_payroll <- function(url) {
  city_page <- read_html(url)
  citystate <- city_page %>% 
    html_nodes(xpath = '/html/body/div[3]/div[4]/h1/span')  %>% 
    html_text() %>% 
    str_split(pattern = ", ") %>% 
    unlist
  
  city_name <- citystate[1]
  state_name <- citystate[2]
  
  city_govt_node <- city_page %>% 
    html_node(xpath = '//*[@id="government-employment"]') 
  
  if (is.na(city_govt_node) ){
    return(NULL)
  }
  
  city_govt <- city_govt_node %>% 
    html_node("table") %>% 
    html_table %>% 
    as_tibble
  
  date <- city_govt[1,1] %>% 
    str_extract("\\(.*\\)") %>% 
    str_remove_all("[\\(\\)]")
  
  # add in city, date data
  gov <- city_govt %>% tail(n = -1L) %>% 
    row_to_names(row_number = 1) %>% 
    mutate(date_str = date,
           city = city_name,
           state = state_name) 
  
  # remove spaces from column names
  gov %>% 
    names %>% 
    str_replace_all(" ", "_") -> 
    names(gov) 
  
  # data type conversion, reorder
  gov <- gov %>% 
    mutate_at(c(2,5), as.numeric) %>% 
    mutate_at(c(3,4,6), ~ as.numeric(str_remove_all(.x, pattern = "[$,]"))) %>% 
    select(c(9, 8, 1, 7, 2, 5, 3, 4, 6)) %>% 
    unique()
  
  
  return(gov)
}

main_url<- "http://www.city-data.com/"
main_page <- read_html(main_url)

# Get URLS for each state
states <- main_page %>% 
  html_nodes(xpath = '//*[@id="tabs_by_category"]/div[3]') %>% 
  html_nodes("a") %>% 
  html_attr("href") %>% 
  as_tibble() %>% 
  rename(URL = value) %>% 
  filter(URL %>% str_detect("/city/"))

# Get URLS for each city
cities <- states$URL %>%
  map_df(get_city_urls)

# Get budgets for entire country, large cities

large_cities <- cities %>% 
  filter(Population >= 450000)

large_city_budgets <- large_cities$URL %>% 
  map_df(get_city_payroll)


## Plot:
large_city_budgets %>% 
  select(state, city, Function, date_str,`Monthly_full-time_payroll`) %>%
  filter(str_detect(Function, "(Protection|Totals)")) %>%  
  filter(`Monthly_full-time_payroll` >= 0)  %>% 
  pivot_wider(names_from = Function, values_from =  `Monthly_full-time_payroll`) %>% 
  ggplot(aes(x=`Totals for Government`, y=`Police Protection - Officers`)) + 
  geom_point() +
  geom_text(aes(label=ifelse(`Police Protection - Officers` > 22000000, city, '')),hjust=0, vjust=0) +
  ylim(0, 100000000) +
  ggtitle("Municipal monthly spending, police vs total")
  

# get budgets, single state
ca_urls <- city_urls %>% 
  filter(URL %>% str_detect("California.html")) 
ca_budgets <- ca_urls$URL %>% 
  map_df(get_city_payroll)