/data/2020/2020-02-18/carbon_footprint.R

https://github.com/rfordatascience/tidytuesday · R · 76 lines · 50 code · 20 blank · 6 comment · 0 complexity · 357432a21c6866d5eb1959a6b8566b79 MD5 · raw file

  1. library(tidyverse)
  2. library(janitor)
  3. library(rvest)
  4. # Credit to Kasia
  5. # Blog post at https://r-tastic.co.uk/post/from-messy-to-tidy/
  # Page that hosts the food carbon footprint index table.
  url <- "https://www.nu3.de/blogs/nutrition/food-carbon-footprint-index-2018"

  # Download and parse the page; the parsed document is queried again further
  # down the script when the header titles are read.
  url_html <- read_html(url)

  # Convert every <table> node on the page to a data frame and keep the first.
  whole_table <- html_table(html_nodes(url_html, "table"), fill = TRUE)[[1]]

  # Drop the redundant first column (X1) and the redundant first three rows.
  table_content <- slice(select(whole_table, -X1), -(1:3))
  # The header labels are read from the `title` attribute of every element
  # carrying the "thead-icon" class.
  raw_headers <- url_html %>%
    html_nodes(".thead-icon") %>%
    html_attr("title")

  # Titles from position 28 onwards are used as the bottom header row.
  # tail(x, -27) equals x[28:length(x)] when at least 28 titles exist, but
  # yields an empty vector (instead of counting backwards) when fewer do.
  # NOTE(review): positions 17-27 and 28+ are hard-coded against the page
  # layout -- confirm them if the page markup changes.
  tidy_bottom_header <- tail(raw_headers, -27)
  head(tidy_bottom_header, 10)

  # Titles 17-27 are used as the middle header row (11 entries).
  raw_middle_header <- raw_headers[17:27]
  raw_middle_header

  # Expand the middle header so it lines up with the data columns: the first
  # seven and the remaining entries are each repeated twice, with the three
  # "total" labels inserted at the positions where the total columns sit.
  tidy_headers <- c(
    rep(raw_middle_header[1:7], each = 2),
    "animal_total",
    rep(tail(raw_middle_header, -7), each = 2),
    "non_animal_total",
    "country_total"
  )
  tidy_headers

  # paste() recycles the shorter vector silently, which would mislabel columns
  # without any error. Fail loudly if the scraped headers do not line up with
  # each other or with the data columns (all columns except the country one).
  stopifnot(
    length(tidy_headers) == length(tidy_bottom_header),
    ncol(table_content) == length(tidy_headers) + 1L
  )

  # Column names take the form "<middle header>;<bottom header>"; the ";" is
  # the separator the later reshaping step splits on.
  combined_colnames <- paste(tidy_headers, tidy_bottom_header, sep = ";")
  colnames(table_content) <- c("Country", combined_colnames)
  glimpse(table_content[, 1:10])
  # Long format: every non-Country column name becomes a value of `Category`,
  # which is then split at ";" into the food category and the metric.
  long_table <- tidyr::separate(
    tidyr::pivot_longer(
      table_content,
      cols = -Country,
      names_to = "Category",
      values_to = "Values"
    ),
    col = Category,
    into = c("Food Category", "Metric"),
    sep = ";"
  )
  glimpse(long_table)

  # Spread the metrics back out into one column each, then normalise all
  # column names to snake_case.
  tidy_table <- janitor::clean_names(
    tidyr::pivot_wider(long_table, names_from = Metric, values_from = Values),
    case = "snake"
  )
  glimpse(tidy_table)
  # Give the two metric columns short names and drop the "total" rows.
  # The rename is positional (columns 3 and 4 of tidy_table).
  # NOTE(review): this relies on the column order produced by the reshaping
  # step -- confirm if the header scraping changes.
  # NOTE(review): "co2_emmission" is misspelled, but it is the column name
  # written to the CSV below, so it is kept as-is to preserve the output schema.
  final_table <- tidy_table %>%
    rename(
      consumption = 3,
      co2_emmission = 4
    ) %>%
    filter(!stringr::str_detect(food_category, "total"))

  # Parse both metric columns to numbers. across() replaces the superseded
  # mutate_at(vars(...)) form and applies parse_number to the same columns.
  clean_table <- final_table %>%
    mutate(across(c(consumption, co2_emmission), parse_number))

  # Write the cleaned data next to the other files for this week.
  clean_table %>%
    write_csv(here::here("2020/2020-02-18", "food_consumption.csv"))
  # Jittered points of consumption per food category (categories reordered by
  # consumption via fct_reorder), coloured by country. The axes are flipped so
  # the categories run down the side, and the legend is hidden.
  ggplot(
    data = clean_table,
    mapping = aes(
      x = fct_reorder(food_category, consumption),
      y = consumption,
      color = country
    )
  ) +
    geom_jitter() +
    coord_flip() +
    theme(legend.position = "none")