/data/2020/2020-09-29/taylor-swift-beyonce.R

https://github.com/rfordatascience/tidytuesday · R · 139 lines · 111 code · 23 blank · 5 comment · 22 complexity · d7d1afbd9e1d1056f4580ee36b933913 MD5 · raw file

  1. library(tidyverse)
  2. library(rvest)
  # Taylor Swift ------------------------------------------------------------

  # Fetch and parse the discography page; the parsed document is kept in
  # `raw_ts_html` so the table extraction below works from it.
  ts_url <- "https://en.wikipedia.org/wiki/Taylor_Swift_discography"
  raw_ts_html <- read_html(ts_url)

  # Turn the scraped table into a tibble whose "album details" entries are
  # spread into their own columns. `sales` stays a list column here and is
  # unnested later by the sales summary.
  ts_raw <- raw_ts_html %>%
    # The table is addressed by its hard-coded position on the page.
    html_node("#mw-content-text > div.mw-parser-output > table:nth-child(10)") %>%
    html_table(fill = TRUE) %>%
    # NOTE(review): data.frame() presumably makes the repeated chart headers
    # unique before clean_names() (hence the `_1`, `_2`, ... suffixes decoded
    # further down) - confirm.
    data.frame() %>%
    janitor::clean_names() %>%
    tibble() %>%
    # Drop the first and the last row of the scraped table.
    slice(-c(1, nrow(.))) %>%
    select(-certifications) %>%
    # Both cells hold newline-separated entries; split them into list columns.
    mutate(across(c(album_details, sales), ~ str_split(.x, "\n"))) %>%
    unnest_longer(album_details) %>%
    # Each detail line is split at ": " into a type and a value.
    separate(
      album_details,
      into = c("album_detail_type", "album_details"),
      sep = ": "
    ) %>%
    # Treat "Re-edition" as the same detail type as "Re-release".
    mutate(
      album_detail_type = replace(
        album_detail_type,
        album_detail_type %in% "Re-edition",
        "Re-release"
      )
    ) %>%
    pivot_wider(names_from = album_detail_type, values_from = album_details) %>%
    # NOTE(review): `na` is presumably the column created for detail lines
    # that carry no type - confirm against the scraped table.
    select(-na) %>%
    janitor::clean_names()
  # Sales in long format: the `sales` list column of `ts_raw` is unnested,
  # each entry is split at ": " into a country and a figure, and the figure
  # is trimmed and parsed to a number.
  ts_sales <- ts_raw %>%
    unnest_longer(sales) %>%
    separate(sales, into = c("country", "sales"), sep = ": ") %>%
    mutate(sales = parse_number(str_trim(sales))) %>%
    select(title, country, sales, released:formats) %>%
    # Tag every row with the artist, placed in front of the title.
    mutate(artist = "Taylor Swift", .before = title)
  # Chart positions in long format: one row per title and chart column.
  ts_chart <- ts_raw %>%
    select(title, released:formats, contains("peak_chart")) %>%
    pivot_longer(
      cols = contains("peak_chart"),
      names_to = "chart",
      values_to = "chart_position"
    ) %>%
    mutate(
      # What is left after removing the common prefix is "" or "_1" ... "_9".
      chart = str_remove(chart, "peak_chart_positions"),
      # Positional lookup from suffix to chart code; any suffix that is not
      # listed yields NA because match() returns NA for it.
      chart = c(
        "US", "AUS", "CAN", "FRA", "GER", "IRE", "JPN", "NZ", "SWE", "UK"
      )[
        match(
          chart,
          c("", "_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9")
        )
      ]
    ) %>%
    # Tag every row with the artist, placed in front of the title.
    mutate(artist = "Taylor Swift", .before = title)
  # Beyonce -----------------------------------------------------------------

  # Fetch and parse the discography page; the parsed document is kept in
  # `raw_bey_html` so the table extraction below works from it.
  bey_url <- "https://en.wikipedia.org/wiki/Beyonc%C3%A9_discography"
  raw_bey_html <- read_html(bey_url)

  # Turn the scraped table into a tibble whose "album details" entries are
  # spread into their own columns. `sales` stays a list column here and is
  # unnested later by the sales summary.
  bey_raw <- raw_bey_html %>%
    # The table is addressed by its hard-coded position on the page.
    html_node("#mw-content-text > div.mw-parser-output > table:nth-child(14)") %>%
    # Longer selector kept from the original script for reference:
    #mw-content-text > div.mw-parser-output > table:nth-child(14) > tbody > tr:nth-child(3) > th > i > a
    html_table(fill = TRUE) %>%
    # NOTE(review): data.frame() presumably makes the repeated chart headers
    # unique before clean_names() (hence the `_1`, `_2`, ... suffixes decoded
    # further down) - confirm.
    data.frame() %>%
    janitor::clean_names() %>%
    tibble() %>%
    # Drop the first and the last row of the scraped table.
    slice(-c(1, nrow(.))) %>%
    select(-certifications) %>%
    # Both cells hold newline-separated entries; split them into list columns.
    mutate(across(c(album_details, sales), ~ str_split(.x, "\n"))) %>%
    unnest_longer(album_details) %>%
    # Each detail line is split at ": " into a type and a value.
    separate(
      album_details,
      into = c("album_detail_type", "album_details"),
      sep = ": "
    ) %>%
    # Treat "Re-edition" as the same detail type as "Re-release".
    mutate(
      album_detail_type = replace(
        album_detail_type,
        album_detail_type %in% "Re-edition",
        "Re-release"
      )
    ) %>%
    pivot_wider(names_from = album_detail_type, values_from = album_details) %>%
    janitor::clean_names()
  # Sales in long format: the `sales` list column of `bey_raw` is unnested,
  # each entry is split at ": " into a country and a figure, and the figure
  # is trimmed and parsed to a number.
  bey_sales <- bey_raw %>%
    unnest_longer(sales) %>%
    separate(sales, into = c("country", "sales"), sep = ": ") %>%
    mutate(sales = parse_number(str_trim(sales))) %>%
    # `format` is renamed to `formats` while selecting.
    select(title, country, sales, released:label, formats = format) %>%
    # Tag every row with the artist, placed in front of the title.
    mutate(artist = "Beyoncé", .before = title)
  # Chart positions in long format: one row per title and chart column.
  bey_chart <- bey_raw %>%
    # `format` is renamed to `formats` while selecting.
    select(title, released:label, formats = format, contains("peak_chart")) %>%
    pivot_longer(
      cols = contains("peak_chart"),
      names_to = "chart",
      values_to = "chart_position"
    ) %>%
    mutate(
      # What is left after removing the common prefix is "" or "_1" ... "_9".
      chart = str_remove(chart, "peak_chart_positions"),
      # Positional lookup from suffix to chart code; any suffix that is not
      # listed yields NA because match() returns NA for it.
      chart = c(
        "US", "AUS", "CAN", "FRA", "GER", "IRE", "JPN", "NZ", "SWE", "UK"
      )[
        match(
          chart,
          c("", "_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9")
        )
      ]
    ) %>%
    # Tag every row with the artist, placed in front of the title.
    mutate(artist = "Beyoncé", .before = title)
  # Stack the per-artist tables, Taylor Swift rows first, then Beyoncé rows.
  all_sales <- ts_sales %>%
    bind_rows(bey_sales)
  all_charts <- ts_chart %>%
    bind_rows(bey_chart)
  # Albums ------------------------------------------------------------------

  # Read every table found on the song-list page.
  song_url <- "https://en.wikipedia.org/wiki/List_of_songs_recorded_by_Beyonc%C3%A9"
  raw_song <- html_table(read_html(song_url), fill = TRUE)

  # Song-to-album lookup built from the second table on the page.
  albums <- raw_song[[2]] %>%
    tibble() %>%
    janitor::clean_names() %>%
    # Keep only the text between the first pair of double quotes.
    mutate(song = str_extract(song, '(?<=").*?(?=")')) %>%
    # The album column is picked by position (fourth column).
    select(song, artist = artist_s, album = 4) %>%
    # Filters left disabled in the original script:
    # filter(str_detect(album, paste0(c("Dangerously in Love", "B'Day", "I Am... Sasha Fierce", "4", "Beyoncé", "Lemonade"), collapse = "|"))) %>%
    # filter(str_detect(album, ":|2004", negate = TRUE)) %>%
    mutate(
      # Every row gets the same artist label, replacing the scraped value.
      artist = "Beyoncé",
      # Remove the first literal asterisk from the album name.
      album = str_remove(album, "\\*")
    )
  # Lyrics file read from a pinned gist revision.
  raw_lyrics <- read_csv("https://gist.githubusercontent.com/sastoudt/cdc16a5a19cf9ae34db0231782231f27/raw/aa274f6c29273942dee5da34cb6c6a23ec67c8c8/beyLyricsNice.csv")

  # Attach album information to each lyric row, matching on artist and song
  # name. The result is not assigned, so it is only printed.
  left_join(
    raw_lyrics,
    albums,
    by = c("artist_name" = "artist", "song_name" = "song")
  )