R/analyzingData.R

#' @title Analyze
#' @description Analyzes the dataset generated by generate_data according to
#' specified method.
#' @param data_to_analyze A data frame with a user-specified number of features
#' to analyze repeated reps number of times. The output from the generate_data
#' function.
#' @param method A user-specified method of analysis. Choose one of "ofaat", "mv_glm",
#' or "lasso". Currently can accommodate one feature at a time hypothesis
#' testing (use "ofaat" argument), fitting all features to a general linear
#' model (use "mv_glm" argument), or a lasso method implemented with glmnet
#' functions (use "lasso" argument). Default: NULL
#' @param p_adjust The method by which p value adjustment will take place.
#' See ?p.adjust for list of possible arguments. Default: NULL
#' @return Dataframe of analyzed results. The structure of the datafrane varies
#' depending on the argument passed to "method".
#' @details Must set global_alpha with set.alpha(alpha_value) where alpha_value
#' is the desired cut-off for significance testing before this function will
#' run to completion.
#'
#' Note that different analysis methods are appropriate for different dataset
#' shapes. Each analysis method has a slightly different return dataframe. This
#' method is designed to be used inside the simulation.
#' @examples
#' 
#'  set.alpha(0.05)
#'  example <- generate_data(50, 50, c(1,2,3), c(0.3, 0, 0.7))
#'  analyzed1 <- analyze(example, method="ofaat", p_adjust="bonferroni")
#'  analyzed2 <- analyze(example, method="mv_glm")
#'  analyzed3 <- analyze(example, method="lasso")
#'  
#' 
#' @rdname analyze
#' @export
analyze <- function(data_to_analyze, method = NULL, p_adjust = NULL) {

    if (method != "ofaat" & method != "mv_glm" & method != "lasso") {
        stop("A valid approach for analyzing data has not been specified")
    }

    if (method == "ofaat") {
        # fit each gene to a linear model (univariate)
        analyzed <- data_to_analyze %>% group_by(reps, n) %>% nest() %>%
            mutate(model = map(data, ~tidy_glm_single(p_adj = p_adjust,
                data = .))) %>% unnest(model)
        analyzed$selected <- ifelse(analyzed$p.value <= global_alpha,
            1, 0)
        # modify 'selected' column to fit with summary later on
    } else if (method == "mv_glm") {
        # fit each gene to a linear model (multivariate)
        analyzed <- data_to_analyze %>% group_by(reps, n) %>% nest() %>%
            mutate(model = map(data, ~tidy_glm_mv(p_adj = p_adjust,
                data = .))) %>% unnest(model)
        analyzed$selected <- ifelse(analyzed$p.value <= global_alpha,
            1, 0)
        analyzed <- analyzed %>% mutate(selected = ifelse(term ==
            "Model", p.value, selected)) %>% mutate(selected = ifelse(term ==
            "AUC", p.value, selected))
    } else if (method == "lasso") {
        analyzed <- data_to_analyze %>% group_by(reps) %>% nest() %>%
            mutate(model = map(data, ~tidy_glmnet(data = .))) %>%
            unnest(model)
    }
    return(analyzed)
}

#' @title Lasso analysis
#' @description Uses glmnet to perform analysis and calculates AUC.
#' @param data_tibble data to be analyzed.
#' @return A 3 column dataframe that tracks the reps number, feature, and
#' whether that feature was selected as significant or not (either a 1 or 0).
#' 'selected' column also stores the calculated AUC before the summarization
#' step.
#' @details An internal function.
#' @seealso
#'  \code{\link[dplyr]{select}}
#' @rdname tidy_glmnet
#' @importFrom dplyr select
tidy_glmnet <- function(data_tibble) {
    genes <- as.matrix(dplyr::select(data_tibble, contains("Gene_")))
    y <- unlist(dplyr::select(data_tibble, treatment))
    cvfit <- cv.glmnet(genes, y, family = "binomial", type.measure = "auc",
        nfolds = 5)
    results <- as.matrix(coef(cvfit, s = "lambda.min"))
    # as.matrix used here rather than tidy in order to get all 0
    # coefficients for summary later
    results <- as.data.frame(t(results)) %>% gather(term, selected)
    results$selected <- ifelse(results$selected == 0, 0, 1)
    # getting auc
    fittedval <- predict(cvfit, genes, type = "response", s = "lambda.min")
    pred <- prediction(fittedval, y)
    auc <- performance(pred, "auc")@y.values[[1]]
    results <- rbind(auc, results)
    results$term[[1]] <- "AUC"

    # add Model row for future summarization compatibility
    results <- rbind(NA, results)
    results$term[[1]] <- "Model"

    return(results)
}

#' @title Fit all genes to a general linear model
#' @description An internal function. Called by analyze when "mv_glm" argument
#' is passed.
#' @param p_adjust p value adjustment argument, used to call p.adjust Default: NULL
#' @param data_tibble The data to be analyzed
#' @return Returns a dataframe with many columns that are not used in the
#' summarization step (such as estimate, std.error, statistic, p.value). Main
#' columns of importance are reps, term, and selected. The "mv_glm" argument
#' allows a model p.value to be calculated as well.
#' @seealso
#'  \code{\link[dplyr]{select}}
#' @rdname tidy_glm_mv
#' @importFrom dplyr select
tidy_glm_mv <- function(p_adjust = NULL, data_tibble) {

    models <- data_tibble %>% dplyr::select(contains("Gene_"), treatment)
    formula <- reformulate(setdiff(colnames(models), "treatment"),
        response = "treatment")
    model <- glm(formula, family = "binomial", models)
    model0 <- glm(treatment ~ 1, family = "binomial", models)

    output_pval <- anova(model0, model, test = "Chisq")
    # fussy tidy function rife with inconsequential warnings otherwise
    output_pval <- tidy(output_pval)
    # get AUC
    fittedval <- predict(model, dplyr::select(models, contains("Gene_")),
        type = "response")
    pred <- prediction(fittedval, dplyr::select(data_tibble, treatment))
    auc <- performance(pred, "auc")@y.values[[1]]

    model <- tidy(model)
    if (!is.null(p_adjust)) {
        model$p.value <- p.adjust(model$p.value, method = p_adjust)
    }
    model <- rbind(output_pval$p.value[[2]], model)
    model$term[[1]] <- "Model"

    model <- rbind(auc, model)
    model$term[[1]] <- "AUC"

    models <- model

    return(models)
}

#' @title A 'one feature at a time' analysis
#' @description An internal function, called when "ofaat" is the argument
#' supplied to 'analyze' method.
#' @param p_adjust passed to p.adjust for p value adjustment Default: NULL
#' @param data_tibble data to analyze
#' @return  Returns a dataframe with many columns that are not used in the
#' summarization step (such as estimate, std.error, statistic, p.value). Main
#' columns of importance are reps, term, and selected.
#' @seealso
#'  \code{\link[dplyr]{select}}
#' @rdname tidy_glm_single
#' @importFrom dplyr select
tidy_glm_single <- function(p_adjust = NULL, data_tibble) {

    models <- data_tibble %>% gather(key = gene, value = gene_expression,
        -sample, -treatment) %>% group_by(gene) %>% nest() %>% mutate(model = map(data,
        ~glm(treatment ~ gene_expression, family = "binomial", data = .))) %>%
        mutate(tidy_ = map(model, ~tidy(.))) %>% unnest(tidy_)
    models <- models %>% filter(term == "gene_expression") %>% dplyr::select(-term) %>%
        rename(term = gene)

    if (!is.null(p_adjust)) {
        models$p.value <- p.adjust(models$p.value, method = p_adjust)
    }
    return(models)
}
emartchenko/mvsimstudy documentation built on May 24, 2019, 5:04 a.m.