R/lyric_functions.R

Defines functions get_word_stems find_stems find_bd_lyrics find_line_matches get_stem_match match_all_stems

Documented in find_bd_lyrics find_line_matches find_stems get_stem_match get_word_stems match_all_stems

#' Convert a sentence, or list of sentences, into a bag of unique stems
#'
#' For a provided text, this function will:
#'  - convert the sentence to a vector of words
#'  - convert each word to lower case
#'  - replace ’ with '
#'  - remove all punctuation other that '
#'  - replace -in' with -ing
#'  - remove snowball stop words (using tidy_text stopwords)
#'  - stem the words with SnowballC::wordStem
#'  - convert to a unique vector
#'
#' @param sentence a space separated string of words, or a list of the same
#' @export
#' @family lyric_functions
#' @return a vector of unique stems found in sentence


get_word_stems <- function(sentence) {
  if (class(sentence) == "list") sentence <- sentence %>% unlist() %>% paste(collapse = " ")
  words <- tolower(unlist(str_split(sentence, pattern = " ", simplify = FALSE)))
  words <- unlist(str_split(words, "-"))
  # convert curly apostrophe to straight one so that  stop_words can recognize it
  words <- str_replace(words, "’", "'")
  # remove punctuation except the '
  words <- str_replace(words, "[^[:alnum:][:space:]']", "")
  # some hard_coded replacements
  words <- str_replace(words, "^I'd$", "I")
  words <- str_replace(words, "^An'^", "And")
  # the stemmer is confused by -in', so we replace with -ing
  words <- str_replace(words, "in'$", "ing")

  # clean up
  words <- words[words != ""]

  words <- words[!words %in% (tidytext::stop_words %>% dplyr::filter(lexicon == "snowball") %>% pull(word))]
  stems <- wordStem(words) %>% unique()
  return(stems)
}


## there is a lot of other text processing that needs to occur
# - remove punctuation , . ? \" ! ”’ “ ‘ some words (esp names) seem to end with '
# - remove the a- at the beginning of some words
#- split words by - so that murder_mind becomes "murder" "mind"
# - check out words like ""worr'in’" - can they be converted to something reliably?
#         'til ev'ryth" should-a o-er it'd 'lectric ev'rybodi An’ (= And)
# what's with "3.\ti, 4.\ti etc


#' Find which stems in a given source text can be found in a given target text
#'
#' @export
#' @family lyric_functions

find_stems <- function(stems_to_search, stems_to_search_in) {
  return(stems_to_search[stems_to_search %in% stems_to_search_in])

}

#' Search for Bob Dylan Songs containing a given lyrical snippet
#'
#' @param lyrics a string of lyrics to search for
#'
#' @export
#' @family lyric_functions
#' @return a subset of bd_songs containing songs with at least one match with lyrics
#'


find_bd_lyrics <- function(lyrics) {
  stems <- get_word_stems(lyrics)
  hits <- match_all_stems(stems)
  names(hits) <- c("Song", "Hits")
  hits$Song <- as.character(hits$Song)
  return(hits)
}


#' Given a lyric snippet and a song name, return vector of matches by line
#'
#' @param song String name of a song in bd_songs$Song
#' @param lyrics String of lyrics to search
#' @param return_values if TRUE return list of stem matches, otherwise return a vector of counts of matches
#' @export
#' @family lyric_functions
#' @return a vector of match counts or a list of matching stems
#'

find_line_matches <- function(song, lyrics, return_values = FALSE) {
  lines <- unlist(bd_songs %>% filter(Song == song) %>% pull(StemsByLine), recursive = FALSE)
  lyric_stems <- get_word_stems(lyrics)
  matches <- map(lines, function(x) {
    find_stems(lyric_stems, x)
  })
  if (!return_values) matches = map_int(matches, length)
  return(matches)
}

#' Find the vector of songs containing the given stem
#'
#' Uses the internal stem_dict which matches all stems found in all Bob Dylan songs to a vector of
#' songs that include them.
#'
#' @param stem a stemmed word
#' @export
#' @family lyric_functions
#' @return a vector of song names that include stem

get_stem_match <- function(stem) stem_dict[[stem]]

#' Find all Bob Dylan songs containing given stems
#'
#' Returns a data.frame() of song names and hit counts
#'
#' @param stems a vector of stems, as generated by \code{get_word_stems()}
#' @export
#' @family lyric_functions
#' @return a data.frame with names 'f' and 'n'



match_all_stems <- function(stems) {
  map(stems, get_stem_match) %>%
    unlist() %>%
    factor() %>%
    fct_count() %>%
    arrange(desc(n))
}
kburnham/bobdylanr documentation built on May 6, 2019, 11:47 a.m.