/scrape_citydata.R

https://github.com/eeg31/police-reform · R · 132 lines · 90 code · 28 blank · 14 comment · 2 complexity · c22eaf244e36c4a6bb3b272bb13ebcfb MD5 · raw file

  1. library(tidyverse)
  2. library(rvest)
  3. library(janitor)
  4. # Data scrap of city-data.com for local govt budgets
  5. # @imaginary_nums 2020
  6. # Input: url to state page
  7. # Value: Tibble of city urls
  8. get_city_urls <- function(url) {
  9. city_nodes <- url %>%
  10. read_html %>%
  11. html_nodes(xpath = '//*[@id="cityTAB"]')
  12. urls <- city_nodes %>%
  13. html_nodes("a") %>%
  14. html_attr("href") %>%
  15. as_tibble() %>%
  16. rename(URL = value) %>%
  17. mutate(URL = paste0("http://www.city-data.com/city/", URL))
  18. cities <- city_nodes %>%
  19. first %>%
  20. html_table %>%
  21. as_tibble %>%
  22. select(Name, Population) %>%
  23. cbind(urls) %>%
  24. mutate(Population = as.integer(str_remove_all(Population, ","))) %>%
  25. filter(URL %>% str_detect("html")) %>%
  26. unique()
  27. return(cities)
  28. }
  29. # Input: url to city page
  30. # Value: Tibble of city payroll data
  31. get_city_payroll <- function(url) {
  32. city_page <- read_html(url)
  33. citystate <- city_page %>%
  34. html_nodes(xpath = '/html/body/div[3]/div[4]/h1/span') %>%
  35. html_text() %>%
  36. str_split(pattern = ", ") %>%
  37. unlist
  38. city_name <- citystate[1]
  39. state_name <- citystate[2]
  40. city_govt_node <- city_page %>%
  41. html_node(xpath = '//*[@id="government-employment"]')
  42. if (is.na(city_govt_node) ){
  43. return(NULL)
  44. }
  45. city_govt <- city_govt_node %>%
  46. html_node("table") %>%
  47. html_table %>%
  48. as_tibble
  49. date <- city_govt[1,1] %>%
  50. str_extract("\\(.*\\)") %>%
  51. str_remove_all("[\\(\\)]")
  52. # add in city, date data
  53. gov <- city_govt %>% tail(n = -1L) %>%
  54. row_to_names(row_number = 1) %>%
  55. mutate(date_str = date,
  56. city = city_name,
  57. state = state_name)
  58. # remove spaces from column names
  59. gov %>%
  60. names %>%
  61. str_replace_all(" ", "_") ->
  62. names(gov)
  63. # data type conversion, reorder
  64. gov <- gov %>%
  65. mutate_at(c(2,5), as.numeric) %>%
  66. mutate_at(c(3,4,6), ~ as.numeric(str_remove_all(.x, pattern = "[$,]"))) %>%
  67. select(c(9, 8, 1, 7, 2, 5, 3, 4, 6)) %>%
  68. unique()
  69. return(gov)
  70. }
  71. main_url<- "http://www.city-data.com/"
  72. main_page <- read_html(main_url)
  73. # Get URLS for each state
  74. states <- main_page %>%
  75. html_nodes(xpath = '//*[@id="tabs_by_category"]/div[3]') %>%
  76. html_nodes("a") %>%
  77. html_attr("href") %>%
  78. as_tibble() %>%
  79. rename(URL = value) %>%
  80. filter(URL %>% str_detect("/city/"))
  81. # Get URLS for each city
  82. cities <- states$URL %>%
  83. map_df(get_city_urls)
  84. # Get budgets for entire country, large cities
  85. large_cities <- cities %>%
  86. filter(Population >= 450000)
  87. large_city_budgets <- large_cities$URL %>%
  88. map_df(get_city_payroll)
  89. ## Plot:
  90. large_city_budgets %>%
  91. select(state, city, Function, date_str,`Monthly_full-time_payroll`) %>%
  92. filter(str_detect(Function, "(Protection|Totals)")) %>%
  93. filter(`Monthly_full-time_payroll` >= 0) %>%
  94. pivot_wider(names_from = Function, values_from = `Monthly_full-time_payroll`) %>%
  95. ggplot(aes(x=`Totals for Government`, y=`Police Protection - Officers`)) +
  96. geom_point() +
  97. geom_text(aes(label=ifelse(`Police Protection - Officers` > 22000000, city, '')),hjust=0, vjust=0) +
  98. ylim(0, 100000000) +
  99. ggtitle("Municipal monthly spending, police vs total")
  100. # get budgets, single state
  101. ca_urls <- city_urls %>%
  102. filter(URL %>% str_detect("California.html"))
  103. ca_budgets <- ca_urls$URL %>%
  104. map_df(get_city_payroll)