#' @title Analyze
#' @description Analyzes the dataset generated by generate_data according to
#' specified method.
#' @param data_to_analyze A data frame with a user-specified number of features
#' to analyze repeated reps number of times. The output from the generate_data
#' function.
#' @param method A user-specified method of analysis. Choose one of "ofaat", "mv_glm",
#' or "lasso". Currently can accommodate one feature at a time hypothesis
#' testing (use "ofaat" argument), fitting all features to a general linear
#' model (use "mv_glm" argument), or a lasso method implemented with glmnet
#' functions (use "lasso" argument). Default: NULL
#' @param p_adjust The method by which p value adjustment will take place.
#' See ?p.adjust for list of possible arguments. Default: NULL
#' @return Dataframe of analyzed results. The structure of the datafrane varies
#' depending on the argument passed to "method".
#' @details Must set global_alpha with set.alpha(alpha_value) where alpha_value
#' is the desired cut-off for significance testing before this function will
#' run to completion.
#'
#' Note that different analysis methods are appropriate for different dataset
#' shapes. Each analysis method has a slightly different return dataframe. This
#' method is designed to be used inside the simulation.
#' @examples
#'
#' set.alpha(0.05)
#' example <- generate_data(50, 50, c(1,2,3), c(0.3, 0, 0.7))
#' analyzed1 <- analyze(example, method="ofaat", p_adjust="bonferroni")
#' analyzed2 <- analyze(example, method="mv_glm")
#' analyzed3 <- analyze(example, method="lasso")
#'
#'
#' @rdname analyze
#' @export
analyze <- function(data_to_analyze, method = NULL, p_adjust = NULL) {
if (method != "ofaat" & method != "mv_glm" & method != "lasso") {
stop("A valid approach for analyzing data has not been specified")
}
if (method == "ofaat") {
# fit each gene to a linear model (univariate)
analyzed <- data_to_analyze %>% group_by(reps, n) %>% nest() %>%
mutate(model = map(data, ~tidy_glm_single(p_adj = p_adjust,
data = .))) %>% unnest(model)
analyzed$selected <- ifelse(analyzed$p.value <= global_alpha,
1, 0)
# modify 'selected' column to fit with summary later on
} else if (method == "mv_glm") {
# fit each gene to a linear model (multivariate)
analyzed <- data_to_analyze %>% group_by(reps, n) %>% nest() %>%
mutate(model = map(data, ~tidy_glm_mv(p_adj = p_adjust,
data = .))) %>% unnest(model)
analyzed$selected <- ifelse(analyzed$p.value <= global_alpha,
1, 0)
analyzed <- analyzed %>% mutate(selected = ifelse(term ==
"Model", p.value, selected)) %>% mutate(selected = ifelse(term ==
"AUC", p.value, selected))
} else if (method == "lasso") {
analyzed <- data_to_analyze %>% group_by(reps) %>% nest() %>%
mutate(model = map(data, ~tidy_glmnet(data = .))) %>%
unnest(model)
}
return(analyzed)
}
#' @title Lasso analysis
#' @description Uses glmnet to perform analysis and calculates AUC.
#' @param data_tibble data to be analyzed.
#' @return A 3 column dataframe that tracks the reps number, feature, and
#' whether that feature was selected as significant or not (either a 1 or 0).
#' 'selected' column also stores the calculated AUC before the summarization
#' step.
#' @details An internal function.
#' @seealso
#' \code{\link[dplyr]{select}}
#' @rdname tidy_glmnet
#' @importFrom dplyr select
tidy_glmnet <- function(data_tibble) {
genes <- as.matrix(dplyr::select(data_tibble, contains("Gene_")))
y <- unlist(dplyr::select(data_tibble, treatment))
cvfit <- cv.glmnet(genes, y, family = "binomial", type.measure = "auc",
nfolds = 5)
results <- as.matrix(coef(cvfit, s = "lambda.min"))
# as.matrix used here rather than tidy in order to get all 0
# coefficients for summary later
results <- as.data.frame(t(results)) %>% gather(term, selected)
results$selected <- ifelse(results$selected == 0, 0, 1)
# getting auc
fittedval <- predict(cvfit, genes, type = "response", s = "lambda.min")
pred <- prediction(fittedval, y)
auc <- performance(pred, "auc")@y.values[[1]]
results <- rbind(auc, results)
results$term[[1]] <- "AUC"
# add Model row for future summarization compatibility
results <- rbind(NA, results)
results$term[[1]] <- "Model"
return(results)
}
#' @title Fit all genes to a general linear model
#' @description An internal function. Called by analyze when "mv_glm" argument
#' is passed.
#' @param p_adjust p value adjustment argument, used to call p.adjust Default: NULL
#' @param data_tibble The data to be analyzed
#' @return Returns a dataframe with many columns that are not used in the
#' summarization step (such as estimate, std.error, statistic, p.value). Main
#' columns of importance are reps, term, and selected. The "mv_glm" argument
#' allows a model p.value to be calculated as well.
#' @seealso
#' \code{\link[dplyr]{select}}
#' @rdname tidy_glm_mv
#' @importFrom dplyr select
tidy_glm_mv <- function(p_adjust = NULL, data_tibble) {
models <- data_tibble %>% dplyr::select(contains("Gene_"), treatment)
formula <- reformulate(setdiff(colnames(models), "treatment"),
response = "treatment")
model <- glm(formula, family = "binomial", models)
model0 <- glm(treatment ~ 1, family = "binomial", models)
output_pval <- anova(model0, model, test = "Chisq")
# fussy tidy function rife with inconsequential warnings otherwise
output_pval <- tidy(output_pval)
# get AUC
fittedval <- predict(model, dplyr::select(models, contains("Gene_")),
type = "response")
pred <- prediction(fittedval, dplyr::select(data_tibble, treatment))
auc <- performance(pred, "auc")@y.values[[1]]
model <- tidy(model)
if (!is.null(p_adjust)) {
model$p.value <- p.adjust(model$p.value, method = p_adjust)
}
model <- rbind(output_pval$p.value[[2]], model)
model$term[[1]] <- "Model"
model <- rbind(auc, model)
model$term[[1]] <- "AUC"
models <- model
return(models)
}
#' @title A 'one feature at a time' analysis
#' @description An internal function, called when "ofaat" is the argument
#' supplied to 'analyze' method.
#' @param p_adjust passed to p.adjust for p value adjustment Default: NULL
#' @param data_tibble data to analyze
#' @return Returns a dataframe with many columns that are not used in the
#' summarization step (such as estimate, std.error, statistic, p.value). Main
#' columns of importance are reps, term, and selected.
#' @seealso
#' \code{\link[dplyr]{select}}
#' @rdname tidy_glm_single
#' @importFrom dplyr select
tidy_glm_single <- function(p_adjust = NULL, data_tibble) {
models <- data_tibble %>% gather(key = gene, value = gene_expression,
-sample, -treatment) %>% group_by(gene) %>% nest() %>% mutate(model = map(data,
~glm(treatment ~ gene_expression, family = "binomial", data = .))) %>%
mutate(tidy_ = map(model, ~tidy(.))) %>% unnest(tidy_)
models <- models %>% filter(term == "gene_expression") %>% dplyr::select(-term) %>%
rename(term = gene)
if (!is.null(p_adjust)) {
models$p.value <- p.adjust(models$p.value, method = p_adjust)
}
return(models)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.