R/word_list.R

#' Raw Word Lists/Frequency Counts
#' 
#' Transcript Apply Raw Word Lists and Frequency Counts by grouping variable(s).
#' 
#' @param text.var The text variable.
#' @param grouping.var The grouping variables.  Default NULL generates one word 
#' list for all text.  Also takes a single grouping variable or a list of 1 or 
#' more grouping variables.
#' @param stopwords A vector of stop words to remove.
#' @param alphabetical If TRUE the output of frequency lists is ordered 
#' alphabetically.  If FALSE the list is ordered by frequency rank.
#' @param cut.n Cut off point for reduced frequency stop word list (rfswl).
#' @param cap logical. If TRUE capitalizes words from the cap.list.
#' @param cap.list Vector of words to capitalize.
#' @param cap.I logical. If TRUE capitalizes words containing the personal 
#' pronoun I.
#' @param rm.bracket logical If TRUE all brackets and bracketed text are 
#' removed from analysis.
#' @param char.keep A character vector of symbols (i.e. punctuation) that 
#' \code{word_list} should keep.  The default is to remove every symbol except 
#' apostrophes.
#' @param apostrophe.remove logical.  If TRUE removes apostrophes from the 
#' output.
#' @param \ldots Other arguments passed to \code{\link[qdap]{strip}}.
#' @return An object of class \code{"word_list"} is a list of lists of verctors 
#' or dataframes containing the following components: 
#' \item{cwl}{complete word list; raw words}
#' \item{swl}{stop word list; same as rwl with stop words removed}
#' \item{fwl}{frequency word list; a data frame of words and corresponding 
#' frequency counts}
#' \item{fswl}{frequency stopword word list; same as fwl but with stopwords 
#' removed}
#' \item{rfswl}{reduced frequency stopword word list; same as fswl but truncated 
#' to n rows}
#' @keywords word-list
#' @export
#' @examples
#' word_list(raj.act.1$dialogue)
#' 
#' out1 <- with(raj, word_list(text.var = dialogue, 
#'     grouping.var = list(person, act)))
#' names(out1)
#' lapply(out1$cwl, "[", 1:5)
#' 
#' with(DATA, word_list(state, person))
#' with(DATA, word_list(state, person, stopwords = Top25Words))
#' with(DATA, word_list(state, person, cap = FALSE, cap.list=c("do", "we")))
word_list <- 
function(text.var, grouping.var = NULL, stopwords = NULL, alphabetical = FALSE,
    cut.n = 20, cap = TRUE, cap.list=NULL, cap.I=TRUE, rm.bracket = TRUE,
    char.keep = NULL, apostrophe.remove = FALSE, ...) {
    upper <- function(x) paste(substring(x, 1, 1), 
        substring(x, 2, nchar(x)), sep="")
    Sw1 <- stopwords[!substring(stopwords, 1, 1) %in% LETTERS]
    Sw2 <- stopwords[substring(stopwords, 1, 1) %in% LETTERS]
    Sw3 <- if (!is.null(Sw2)) {
        paste0(Sw2, "'s")
    } else {
        NULL
    }
    stopwords <- c(Sw1, upper(Sw2), Sw2, tolower(Sw2), Sw3, tolower(Sw3))
    if(is.null(grouping.var)) {
        dat <- as.data.frame(rep("all", length(text.var)), 
            drop = FALSE, stringsAsFactors = FALSE)
        names(dat) <- "all"
        grouping.var <- dat
    }
    group.var <- if (is.list(grouping.var) & length(grouping.var)>1) {
        apply(data.frame(grouping.var), 1, function(x){
                if (any(is.na(x))){
                    NA
                } else {
                    paste(x, collapse = ".")
                }
            }
        )
    } else {
        grouping.var
    }   
    NAME <- if (is.list(grouping.var)) {
        m <- unlist(as.character(substitute(grouping.var))[-1])
        m <- sapply(strsplit(m, "$", fixed=TRUE), 
            function(x) x[length(x)], USE.NAMES = FALSE)
            paste(m, collapse="&")
    } else {
        G <- as.character(substitute(grouping.var))
        G[length(G)]
    }
    word_lists1 <- textLISTER(text.var = text.var, group.vars = group.var,
        rm.bracket = rm.bracket, char.keep = char.keep, apostrophe.remove = apostrophe.remove, ...)
    words.UNLISTED <- lapply(word_lists1, function(x) {
            y <- unlist(x)
            names(y) <- NULL
            return(y)
        }
    ) 
    if (cap) { 
        word_lists2 <- lapply(word_lists1, function(x) {
                y <- capitalizer(x, caps.list=cap.list)
                names(y) <- NULL
                return(y)
            }
        )    
    } else {
        word_lists2 <- lapply(word_lists1, function(x) {
                y <- unlist(x)
                names(y) <- NULL
                return(y)
            }
        ) 
    }
    naomit <- function(x) x[!is.na(x)]
    word_lists2 <- lapply(word_lists2, naomit)
    stopped.word_list <- lapply(words.UNLISTED, function(x) {
            x[!x %in% stopwords]
        }
    )
    stopped.word_list <- lapply(stopped.word_list, naomit)
    stopped.word_list <- lapply(stopped.word_list, function(x){ 
            capitalizer(x, caps.list = cap.list)
        }
    )
    COUNT <- function(x) {
        if (is.null(x) | identical(x, character(0)) | 
            identical(x, logical(0))) {
            DF <- data.frame(WORD=NA, FREQ=NA)
        } else {
            DF <- data.frame(table(x))
            names(DF) <- c("WORD", "FREQ")
            DF$WORD <- as.character(DF$WORD)
            DF$FREQ <- as.numeric(DF$FREQ)
            DF <- DF[order(-DF$FREQ, DF$WORD), ]
            rownames(DF) <- NULL
        }
        return(DF)
    }
    freq.word_list <- lapply(word_lists2, COUNT)
    freq.stop.word_list <- lapply(stopped.word_list, COUNT)
    red.freq.stop.word_list <- ncutWORDS(freq.stop.word_list, cut.n = cut.n) 
    word_lists2 <- lapply(word_lists2, function(x) {
            if (is.null(x)){
                return(x)
            } else { 
                comment(x) <- "bagOwords"
                return(x)
            }
        }
    )
    stopped.word_list <- lapply(stopped.word_list, function(x) {
        if (is.null(x)){
                return(x)
        } else { 
                comment(x) <- "bagOwords"
                return(x)
            }
        }
    )
    freq.word_list <- lapply(freq.word_list, function(x) {
            if (is.null(x)) {
                return(x)
            } else { 
                comment(x) <- "freqList"
                return(x)
            }
        }
    )
    freq.stop.word_list <- lapply(freq.stop.word_list, function(x) {
            if (is.null(x)) {
                return(x)
            } else { 
                comment(x) <- "freqList"
                return(x)
            }
        }
    )
    red.freq.stop.word_list <- lapply(red.freq.stop.word_list, function(x) {
            if (is.null(x)){
                return(x)
            } else { 
                comment(x) <- "freqList"
                return(x)
            }
        }
    )
    comment(word_lists2) <- "cwl"    
    comment(stopped.word_list) <- "swl"
    comment(freq.word_list) <- "fwl" 
    comment(freq.stop.word_list) <- "fswl"
    comment(red.freq.stop.word_list) <- "rfswl"
    if (alphabetical) {
        asort <- function(dat, col=1) {
            dat2 <-dat[order(dat[, col]), ]
            rownames(dat2) <- NULL
            return(dat2)
        }
        freq.word_list <- lapply(freq.word_list, asort)
        freq.stop.word_list <- lapply(freq.stop.word_list, asort)
        red.freq.stop.word_list <- lapply(red.freq.stop.word_list, asort)
    } 
    o <- list(cwl = word_lists2, swl = stopped.word_list, 
        fwl = freq.word_list, fswl = freq.stop.word_list, 
        rfswl = red.freq.stop.word_list)
    class(o) <- c("word_list", "data.frame")
    return(o)
}

#' Prints a word_list Object
#' 
#' Prints a word_list object.
#' 
#' @param x The word_list object
#' @param \ldots ignored
#' @method print word_list
#' @S3method print word_list
print.word_list <-
function(x, ...) {
    print(x$rfswl)
}
trinker/qdap2 documentation built on May 31, 2019, 9:47 p.m.