/R/genius_url.R

https://github.com/JosiahParry/geniusR · R · 136 lines · 60 code · 25 blank · 51 comment · 1 complexity · 10700aa5b98e3ab567b6bb27fa697cbc MD5 · raw file

  # Declare the column names that this file refers to through non-standard
  # evaluation (inside dplyr / tidyr verbs) so that R CMD check does not
  # report them as undefined global variables.
  # utils::globalVariables() is only available from R 2.15.1, hence the guard.
  if (getRversion() >= "2.15.1") {
    utils::globalVariables(
      names = c(
        "type",
        "lyric",
        "line",
        "meta",
        "element_artist",
        "element",
        "track_title"
      )
    )
  }
  #' Use Genius url to retrieve lyrics
  #'
  #' This function is used inside of the `genius_lyrics()` function. Given a url
  #' to a song on Genius, this function returns a tibble where each row is one
  #' line. Pair this function with `gen_song_url()` for easier access to song
  #' lyrics.
  #'
  #' @param url The url of song lyrics on Genius
  #' @param info Default \code{"title"}, returns the track title. Set to
  #'   \code{"simple"} for only lyrics, \code{"artist"} for the lyrics and
  #'   artist, \code{"features"} for song element and the artist of that
  #'   element, \code{"all"} to return artist, track, line, lyric, element, and
  #'   element artist. Any other value is an error.
  #'
  #' @return A tibble with one row per lyric line. The columns \code{line} and
  #'   \code{lyric} are always present; \code{artist}, \code{track_title},
  #'   \code{element} and \code{element_artist} are kept or dropped according to
  #'   \code{info}.
  #'
  #' @examples
  #' \donttest{
  #' # genius_url("https://genius.com/Head-north-in-the-water-lyrics", info = "all")
  #'
  #' # url <- gen_song_url(artist = "Kendrick Lamar", song = "HUMBLE")
  #'
  #' # genius_url(url)
  #'
  #'}
  #' @export
  #' @importFrom rvest session html_nodes html_node html_text
  #' @importFrom tidyr pivot_wider fill separate replace_na
  #' @importFrom stringr str_detect str_extract str_replace_all str_trim
  #' @importFrom tibble tibble
  #' @importFrom dplyr mutate bind_rows case_when filter group_by ungroup n row_number
  #' @importFrom purrr pluck
  genius_url <- function(url, info = "title") {
    # Reject an unsupported `info` up front; without this check an unknown
    # value would fall through switch() below and silently return NULL.
    info <- match.arg(info, c("title", "simple", "artist", "features", "all"))

    # create a new session for scraping lyrics
    genius_session <- session(url)

    # Container classes are frequently changing
    # need to id class based on partial name matching
    # get the classes of all children of divs to pattern match properly
    class_names <- genius_session %>%
      rvest::html_elements("div") %>%
      rvest::html_children() %>%
      rvest::html_attr("class") %>%
      unique() %>%
      stats::na.omit() %>%
      stringr::str_split("[:space:]") %>%
      unlist()

    # Turn a partial class name into a CSS class selector (".<class>").
    # Stops with a clear message when the page has no matching class, because
    # paste0(".", character(0)) would otherwise produce the invalid selector ".".
    class_selector <- function(pattern) {
      matched <- class_names[stringr::str_detect(class_names, pattern)]
      if (length(matched) == 0) {
        stop(
          "Could not find a class matching `", pattern, "` on ", url,
          call. = FALSE
        )
      }
      paste0(".", matched)
    }

    # fetch class selectors for song title, artist and lyrics
    title_selector <- class_selector("SongHeader__Title")
    artist_selector <- class_selector("SongHeader__Artist")
    lyrics_selector <- class_selector("Lyrics__Container")

    # Get Artist name
    artist <- html_nodes(genius_session, artist_selector) %>%
      html_text() %>%
      str_replace_all("\n", "") %>%
      str_trim()

    # Get Song title
    song_title <- html_nodes(genius_session, title_selector) %>%
      html_text() %>%
      str_replace_all("\n", "") %>%
      str_trim()

    # scrape the lyrics
    lyrics <-
      # read the text from the lyrics class
      html_node(genius_session, lyrics_selector) %>%
      # trim white space
      html_text(trim = TRUE) %>%
      # use named vector for cleaning purposes
      str_replace_all(cleaning()) %>%
      strsplit(split = "\n") %>%
      purrr::pluck(1) %>%
      # filter to only rows with content
      .[str_detect(., "[[:alnum:]]")] %>%
      # trim whitespace
      str_trim() %>%
      # Convert to tibble
      tibble(
        artist = artist,
        track_title = song_title,
        lyric = .
      ) %>%
      mutate(line = row_number()) %>%
      # add one dummy "lyric" row and one dummy "meta" row so that both columns
      # exist after pivoting; their `line` is NA and they are dropped right after
      bind_rows(tibble(lyric = c("", "[]"))) %>%
      mutate(
        type = case_when(
          str_detect(lyric, "\\[|\\]") ~ "meta",
          TRUE ~ "lyric"
        )
      ) %>%
      pivot_wider(names_from = type, values_from = lyric) %>%
      dplyr::filter(!is.na(line)) %>%
      fill(meta, .direction = "down") %>%
      # remove brackets
      mutate(meta = str_extract(meta, "[^\\[].*[^\\]]")) %>%
      # make "element" and "artist" columns
      # sections of a song are called an element. Artists are responsible for
      # each element
      separate(
        meta,
        into = c("element", "element_artist"),
        sep = ": ",
        fill = "right"
      ) %>%
      # if song has no features
      mutate(element_artist = replace_na(element_artist, artist[1])) %>%
      # filter out NA's from spreading meta
      # this will keep the meta if there are no following lyrics
      # this is helpful to keep track of instrumentals
      group_by(element) %>%
      # if there is only one line (meaning only element info) keep the NA,
      # else drop
      filter(!(is.na(lyric) & n() > 1)) %>%
      ungroup() %>%
      # create new line numbers in case they have been messed up
      mutate(line = row_number())

    # Drop the columns that were not requested. select() is namespaced because
    # it is not part of this file's @importFrom dplyr list.
    switch(info,
      simple = dplyr::select(
        lyrics, -artist, -track_title, -element, -element_artist
      ),
      artist = dplyr::select(lyrics, -track_title, -element, -element_artist),
      title = dplyr::select(lyrics, -artist, -element, -element_artist),
      features = dplyr::select(lyrics, -artist, -track_title),
      all = lyrics
    )
  }