R/bootstrap_dfm.R

Defines functions bootstrap_dfm.dfm bootstrap_dfm.character bootstrap_dfm.corpus bootstrap_dfm.default bootstrap_dfm

Documented in bootstrap_dfm

#' Bootstrap a dfm
#'
#' Create an array of resampled dfms.
#' @param x a character or [corpus] object
#' @param n number of resamples
#' @param ... additional arguments passed to [dfm()]
#' @param verbose if `TRUE` print status messages
#' @details Function produces multiple, resampled [dfm] objects, based on
#'   resampling sentences (with replacement) from each document, recombining
#'   these into new "documents" and computing a dfm for each. Resampling of
#'   sentences is done strictly within document, so that every resampled
#'   document will contain at least some of its original tokens.
#' @return A named list of [dfm] objects, where the first, `dfm_0`, is
#'   the dfm from the original texts, and subsequent elements are the
#'   sentence-resampled dfms.
#' @author Kenneth Benoit
#' @export
#' @keywords dfm experimental bootstrap
#' @examples
#' # bootstrapping from the original text
#' set.seed(10)
#' txt <- c(textone = "This is a sentence.  Another sentence.  Yet another.",
#'          texttwo = "Premiere phrase.  Deuxieme phrase.")
#' bootstrap_dfm(txt, n = 3, verbose = TRUE)
#'
bootstrap_dfm <- function(x, n = 10, ..., verbose = quanteda_options("verbose")) {
    UseMethod("bootstrap_dfm")
}

#' @export
bootstrap_dfm.default <- function(x, n = 10, ..., verbose = quanteda_options("verbose")) {
    check_class(class(x), "bootstrap_dfm")
}

#' @noRd
#' @importFrom stringi stri_replace_all_fixed
#' @export
bootstrap_dfm.corpus <- function(x, n = 10, ..., verbose = quanteda_options("verbose")) {
    if (verbose)
        message("Segmenting the ",
                stri_replace_all_fixed(as.character(sys.calls()[2][[1]])[1],
                                       "bootstrap_dfm.", ""),
                " into sentences...", appendLF = FALSE)
    x <- as.corpus(x)
    x <- corpus_reshape(x, to = "sentences")
    if (verbose) message("done.")
    bootstrap_dfm(suppressWarnings(dfm(x, ...)),  n = n, ..., verbose = verbose)
}

#' @noRd
#' @export
bootstrap_dfm.character <- function(x, n = 10, ..., verbose = quanteda_options("verbose")) {
    bootstrap_dfm(corpus(x), n = n, ..., verbose = verbose)
}

#' @noRd
#' @export
#' @examples
#' # bootstrapping from a dfm
#' dfmat <- corpus_reshape(corpus(txt), to = "sentences") %>%
#'     tokens() %>%
#'     dfm()
#' bootstrap_dfm(dfmat, n = 3)
bootstrap_dfm.dfm <- function(x, n = 10, ..., verbose = quanteda_options("verbose")) {
    x <- as.dfm(x)
    n <- check_integer(n, min = 0)
    verbose <- check_logical(verbose)

    if (verbose) {
        message("Bootstrapping the sentences to create multiple dfm objects...")
        message("   ...resampling and forming dfms: 0", appendLF = FALSE)
    }
    result <- list()
    result[[1]] <- dfm_group(x, groups = docid(x), fill = TRUE)
    for (i in seq_len(n)) {
        if (verbose)
            message(", ", i, appendLF = FALSE)
        temp <- dfm_sample(x, size = NULL, replace = TRUE, by = docid(x))
        temp <- dfm_group(temp)
        result[[i + 1]] <- temp
    }
    names(result) <- paste0("dfm_", seq(0, n))
    if (verbose)
        message("\n   ...complete.\n")
    class(result) <- c("dfm_bootstrap")
    return(result)
}

Try the quanteda package in your browser

Any scripts or data that you put into this service are public.

quanteda documentation built on May 31, 2023, 8:28 p.m.