# R/preText.R

# Defines functions preText

# Documented in preText

#' @title preText Test
#' @description Calculates preText scores for each preprocessing specification.
#' Document distances are generated with `scaling_comparison()`, scored with
#' `preText_test()`, and the scores are then regressed on the preprocessing
#' choices with `preprocessing_choice_regression()`.
#'
#' @param preprocessed_documents A list object generated by the
#' `factorial_preprocessing()` function. It must contain the elements
#' `dfm_list`, `choices` and `labels`; an error is raised otherwise.
#' @param dataset_name A string indicating the name to be associated with the
#' results. Defaults to "Documents".
#' @param distance_method The method that should be used for calculating
#' document distances. Defaults to "cosine".
#' @param num_comparisons The number of ranks to use in calculating average
#' difference (forwarded to `preText_test()`). Defaults to 50.
#' @param parallel Logical indicating whether the preText score calculation
#' (`preText_test()`) should be performed in parallel. Defaults to FALSE.
#' @param cores Defaults to 1, can be set to any number less than or equal to
#' the number of cores on one's computer. Forwarded to both
#' `scaling_comparison()` and `preText_test()`.
#' @param verbose Logical indicating whether more information should be printed
#' to the screen to let the user know about progress. Defaults to TRUE. The
#' flag is forwarded to `scaling_comparison()` and `preText_test()`; the
#' top-level status messages and the regression table printed by this function
#' are always shown.
#' @return A list with four elements: `preText_scores` (the unordered
#' specification-level results from `preText_test()`), `ranked_preText_scores`
#' (the `dfm_level_results` element from `preText_test()`), `choices` (taken
#' unchanged from `preprocessed_documents`) and `regression_results` (the
#' unrounded output of `preprocessing_choice_regression()`).
#' @examples
#' \dontrun{
#' # load the package
#' library(preText)
#' # load in the data
#' data("UK_Manifestos")
#' # preprocess data
#' preprocessed_documents <- factorial_preprocessing(
#'     UK_Manifestos,
#'     use_ngrams = TRUE,
#'     infrequent_term_threshold = 0.02,
#'     verbose = TRUE)
#' # run preText
#' preText_results <- preText(
#'     preprocessed_documents,
#'     dataset_name = "UK Manifestos",
#'     distance_method = "cosine",
#'     num_comparisons = 100,
#'     verbose = TRUE)
#' }
#' @export
preText <- function(preprocessed_documents,
                    dataset_name = "Documents",
                    distance_method = "cosine",
                    num_comparisons = 50,
                    parallel = FALSE,
                    cores = 1,
                    verbose = TRUE){

    # Fail early with a clear message instead of letting a malformed input
    # surface as an obscure error deep inside the helper functions.
    if (!is.list(preprocessed_documents)) {
        stop("`preprocessed_documents` must be a list generated by ",
             "factorial_preprocessing().",
             call. = FALSE)
    }

    # extract the pieces of preprocessed_documents that are used below
    dfm_object_list <- preprocessed_documents$dfm_list
    choices <- preprocessed_documents$choices
    labels <- preprocessed_documents$labels

    is_missing <- c(dfm_list = is.null(dfm_object_list),
                    choices = is.null(choices),
                    labels = is.null(labels))
    if (any(is_missing)) {
        stop("`preprocessed_documents` is missing required element(s): ",
             paste(names(is_missing)[is_missing], collapse = ", "),
             ". Pass the list returned by factorial_preprocessing().",
             call. = FALSE)
    }

    ptm <- proc.time()

    # the last specification is used as the baseline in both the preText test
    # and the regression
    baseline_index <- length(labels)

    cat("Generating document distances...\n")
    # get document distances
    scaling_results <- scaling_comparison(dfm_object_list,
                                          dimensions = 2,
                                          distance_method = distance_method,
                                          verbose = verbose,
                                          cores = cores)

    # extract distance matrices
    distance_matrices <- scaling_results$distance_matrices
    cat("Generating preText Scores...\n")

    preText_results <- preText_test(
        distance_matrices,
        choices = choices,
        labels = labels,
        baseline_index = baseline_index,
        text_size = 1,
        num_comparisons = num_comparisons,
        parallel = parallel,
        cores = cores,
        verbose = verbose)

    preText_scores <- preText_results$dfm_level_results_unordered
    cat("Generating regression results..\n")

    reg_results <- preprocessing_choice_regression(
        Y = preText_scores$preText_score,
        choices = choices,
        dataset = dataset_name,
        base_case_index = baseline_index)

    cat("Regression results (negative coefficients imply less risk):\n")
    # create temporary results so we can round the first two columns for
    # display without altering the values that are returned
    # NOTE(review): columns 1 and 2 are presumably the coefficient and its
    # standard error -- confirm against preprocessing_choice_regression()
    reg_results2 <- reg_results
    for (j in c(1L, 2L)) {
        reg_results2[, j] <- round(reg_results2[, j], 3)
    }
    # show the third column first, followed by the two rounded columns
    print(reg_results2[, c(3, 1, 2)])

    # third entry of proc.time() is the elapsed (wall clock) time
    elapsed <- proc.time() - ptm
    cat("Complete in:", elapsed[[3]], "seconds...\n")

    # extract relevant info
    list(preText_scores = preText_scores,
         ranked_preText_scores = preText_results$dfm_level_results,
         choices = choices,
         regression_results = reg_results)
}
# matthewjdenny/preText documentation built on July 27, 2021, 1:18 a.m.