/R/genius_url.R
https://github.com/JosiahParry/geniusR · R · 136 lines · 60 code · 25 blank · 51 comment · 1 complexity · 10700aa5b98e3ab567b6bb27fa697cbc MD5 · raw file
- if(getRversion() >= "2.15.1") {
- utils::globalVariables(c("type", "lyric", "line", "meta",
- "element_artist", "element", "track_title"))
- }
- #' Use Genius url to retrieve lyrics
- #'
- #' This function is used inside of the `genius_lyrics()` function. Given a url to a song on Genius, this function returns a tibble where each row is one line. Pair this function with `gen_song_url()` for easier access to song lyrics.
- #'
- #' @param url The url of song lyrics on Genius
- #' @param info Default \code{"title"}, returns the track title. Set to \code{"simple"} for only lyrics, \code{"artist"} for the lyrics and artist, \code{"features"} for song element and the artist of that element, \code{"all"} to return artist, track, line, lyric, element, and element artist.
- #'
- #' @examples
- #' \donttest{
- #' #' genius_url("https://genius.com/Head-north-in-the-water-lyrics", info = "all")
- #'
- #' # url <- gen_song_url(artist = "Kendrick Lamar", song = "HUMBLE")
- #'
- #' # genius_url(url)
- #'
- #'}
- #' @export
- #' @importFrom rvest session html_nodes html_node html_text
- #' @importFrom tidyr pivot_wider fill separate replace_na
- #' @importFrom stringr str_detect str_extract str_replace_all str_trim
- #' @importFrom tibble tibble
- #' @importFrom dplyr mutate bind_rows case_when filter group_by ungroup n row_number
- #' @importFrom purrr pluck
- genius_url <- function(url, info = "title") {
- # create a new session for scraping lyrics
- # create a new session for scraping lyrics
- genius_session <- session(url)
- # Container classes are frequently changing
- # need to id class based on partial name matching
- # get the classes of all children of divs to pattern match properly
- class_names <- genius_session %>%
- rvest::html_elements("div") %>%
- rvest::html_children() %>%
- rvest::html_attr("class") %>%
- unique() %>%
- stats::na.omit() %>%
- stringr::str_split("[:space:]") %>%
- unlist()
- # fetch class names for song title artist and lyrics
- # will need to add `.` for all of them
- title_class <- class_names[stringr::str_detect(class_names, "SongHeader__Title")]
- artist_class <- class_names[stringr::str_detect(class_names, "SongHeader__Artist")]
- lyrics_class <- class_names[stringr::str_detect(class_names, "Lyrics__Container")]
- # Get Artist name
- artist <- html_nodes(genius_session, paste0(".", artist_class)) %>%
- html_text() %>%
- str_replace_all("\n", "") %>%
- str_trim()
- # Get Song title
- song_title <- html_nodes(genius_session, paste0(".", title_class)) %>%
- html_text() %>%
- str_replace_all("\n", "") %>%
- str_trim()
- # scrape the lyrics
- lyrics <- # read the text from the lyrics class
- # read the text from the lyrics class
- html_node(genius_session, paste0(".", lyrics_class)) %>%
- # trim white space
- html_text(trim = TRUE) %>%
- # use named vector for cleaning purposes
- str_replace_all(cleaning()) %>%
- strsplit(split = "\n") %>%
- purrr::pluck(1) %>%
- # filter to only rows with content
- .[str_detect(., "[[:alnum:]]")] %>%
- # trim whitespace
- str_trim() %>%
- # Convert to tibble
- tibble(artist = artist,
- track_title = song_title,
- lyric = .) %>%
- mutate(line = row_number()) %>%
- bind_rows(tibble(lyric = c("", "[]"))) %>%
- mutate(type =
- case_when(
- str_detect(lyric, "\\[|\\]") ~ "meta",
- TRUE ~ "lyric")) %>%
- pivot_wider(names_from = type, values_from = lyric) %>%
- #spread(key = type, value = lyric)
- dplyr::filter(!is.na(line)) %>%
- fill(meta, .direction = "down") %>%
- #remove producer info
- #filter(!str_detect(lyric, "[Pp]roducer")) %>%
- #remove brackets
- mutate(meta = str_extract(meta, "[^\\[].*[^\\]]")) %>%
- #make "element" and "artist" columns
- # sections of a song are called an element. Artists are resopnsible for each element
- separate(meta, into = c("element", "element_artist"), sep = ": ", fill = "right") %>%
- #if song has no features
- mutate(element_artist = replace_na(element_artist, artist[1])) %>%
- # filter out NA's from spreading meta
- # this will keep the meta if there are no following lyrics
- # this is helpful to keep track of instrumentals
- group_by(element) %>%
- # if there is only one line (meaning only element info) keep the NA, else drop
- filter(ifelse(is.na(lyric) & n() > 1, FALSE, TRUE)) %>%
- ungroup() %>%
- # create new line numbers incase they have been messed up
- mutate(line = row_number())
- switch(info,
- simple = {return(select(lyrics, -artist, -track_title, -element, -element_artist))},
- artist = {return(select(lyrics, -track_title, -element, -element_artist))},
- title = {return(select(lyrics, -artist, -element, -element_artist))},
- features = {return(select(lyrics, -artist, -track_title))},
- all = return(lyrics)
- )
- }