#' @import dplyr
#' @export run_dge
#' @export run_dgsva
#' @export run_ma
#' @export prepare_for_dge
prepare_for_dge <- function (datasets, dataset_names, contrast, covariates = NULL, scrutinize_covariates = FALSE, blocks = NULL, scrutinize_blocks = FALSE, sample_filters = NULL, min_num_datasets_to_use_gene = 1, min_genes_to_keep_dataset = 5, min_samples_per_level = 2, fix_NA = c("none", "mean")) {
#' Remember to pre-merge any synonymous contrast levels before using OMA!
#'
#' datasets - this needs to ba a list of dataset objects.
#' dataset_names - this needs to ba a vector of dataset names - same order as datasets above.
#' contrast - a three-member list, where the first filed is the name of the variable to use from "pheno", second is
#' the name of the active level in the contrast, and the third is the name of the reference level.
#' sample_filters - a quosure of filters; previously this was a string that would be passed into dplyr's `filter_()`;
#' however, `filter_()` was deprecated, and this was quite error-prone anyway, as it required a double qotations in
#' and similar risky business in the case of strings. So now a full expression is expected that goes directly into
#' dplyr's `filter()` - send it in inside `rlang::quo()` which will make it a quosure.
#' response_variable - informs which variable is the one to use as a contrast
if (!is.null(sample_filters) & !rlang::is_quosure(sample_filters)) { stop("`sample_filters` needs to be a quosure!") }
if (!is.list(contrast) | (length(contrast) != 3) | !all(names(contrast) == c("variable", "active", "reference"))) { stop("`contrast` needs to be a 3-value list, with the following properties: \"variable\" (the name of the contrast variable in the data), \"active\" (the active level(s) of the contrast), \"reference\" (reference level(s) of the contrast).") }
if (!is.null(covariates) & !is.list(covariates) & !is.vector(covariates)) { stop("`covariates` needs to be a list with per-dataset covariates, or a vector of covariates that will be used across all datasets!") }
if (is.list(covariates) & !all(names(covariates) %in% dataset_names)) { stop("`covariates` needs to be a list with names corresponding to dataset names!") }
if (!is.null(blocks) & !is.list(blocks) & !is.vector(blocks)) { stop("`blocks` needs to be a list with per-dataset blocks, or a vector of blocks that will be used across all datasets!") }
if (is.list(blocks) & !all(names(blocks) %in% dataset_names)) { stop("`blocks` needs to be a list with names corresponding to dataset names!") }
if (is.list(blocks) & !all(sapply(blocks, function (x) length(x) == 1))) { stop("`blocks` needs to be a list with only ONE variable name per dataset!") }
if(is.vector(blocks) & (length(blocks) != 1)) { stop("If `blocks` is a vector, than it needs to be a vector of length 1 (only a single variable can be used to identify duplicates).") }
lapply(names(datasets), function (dat_nam) {
dat <- datasets[[dat_nam]]
if (!all(names(dat) %in% c("gene_expression_data", "sample_data"))) { stop(paste0("Each dataset needs to be a list with two dataframes, one for gene expression (a named list element called \"gene_expression_data\"), and sample data (another named list element called \"sample_data\"). This is not the case in ", dat_nam)) }
if (!is.data.frame(dat$gene_expression_data) & !is.matrix(dat$gene_expression_data)) { stop(paste0("A \"gene_expression_data\" field is present in ", dat_nam, ", but it's not a data frame or matrix.")) }
if (!is.data.frame(dat$sample_data)) { stop(paste0("A \"sample_data\" field is present in ", dat_nam, ", but it's not a data frame.")) }
if (!("sample_name" %in% colnames(dat$sample_data))) { stop(paste0("The \"sample_data\" dataframe needs to have a \"sample_name\" column corresponding to the rownames in the expression data. This column wasn't found in ", dat_nam, "!")) }
})
## testing
# contrast <- list(
# variable = "disease_state",
# active = "atopic dermatitis",
# reference = "normal control"
# )
response_variable <- contrast[["variable"]]
active_levels <- contrast[["active"]]
reference_levels <- contrast[["reference"]]
fix_NA <- match.arg(fix_NA)
genes_to_use <- sapply(datasets, function(x) colnames(x$gene_expression_data)) %>% unlist %>% unname %>% table
genes_to_use <- names(genes_to_use[genes_to_use >= min_num_datasets_to_use_gene])
## testing
# covariates <- "gender"
if (is.vector(covariates)) {
covariates <- rep(list(covariates), length(dataset_names))
names(covariates) <- dataset_names
}
if (!is.null(covariates) & scrutinize_covariates) {
covariates <- lapply(1:length(datasets), function (d_num) {
remaining_covariates <- intersect(covariates[[d_num]], colnames(datasets[[d_num]]$sample_data))
if (length(remaining_covariates) > 0)
remaining_covariates <- remaining_covariates[remaining_covariates %>% sapply(function (x) datasets[[d_num]]$sample_data %>% filter(!!rlang::sym(response_variable) %in% c(active_levels, reference_levels)) %>% pull(x) %>% as.character %>% unique %>% length >= 2)]
if (length(remaining_covariates) == 0)
remaining_covariates <- NULL
return(remaining_covariates)
})
names(covariates) <- dataset_names
}
if (is.vector(blocks)) {
blocks <- rep(list(blocks), length(dataset_names))
names(blocks) <- dataset_names
}
if (!is.null(blocks) & scrutinize_blocks) {
blocks <- lapply(1:length(datasets), function (d_num) {
remaining_blocks <- intersect(blocks[[d_num]], colnames(datasets[[d_num]]$sample_data))
if (length(remaining_blocks) > 0)
remaining_blocks <- remaining_blocks[remaining_blocks %>% sapply(function (x) datasets[[d_num]]$sample_data %>% filter(!!rlang::sym(response_variable) %in% c(active_levels, reference_levels)) %>% pull(x) %>% as.character %>% unique %>% length >= 2)]
if (length(remaining_blocks) == 0)
remaining_blocks <- NULL
return(remaining_blocks)
})
names(blocks) <- dataset_names
}
dge_info <- list(
run_date = date(),
min_num_datasets_to_use_gene = min_num_datasets_to_use_gene,
min_genes_to_keep_dataset = min_genes_to_keep_dataset,
min_samples_per_level = min_samples_per_level,
contrast = contrast,
scrutinize_covariates = scrutinize_covariates,
covariates = covariates,
scrutinize_blocks = scrutinize_blocks,
blocks = blocks,
fix_NA = fix_NA,
input_datasets = dataset_names
)
dge_data <- lapply(1:length(datasets), function (d_num) {
dataset_name <- dataset_names[d_num]
d <- datasets[[d_num]]
if (!all(covariates[[dataset_name]] %in% colnames(d$sample_data))) { stop(paste0("Covariates specified for dataset ", dataset_name, " that don't exist in its pheno data")) }
if (!all(blocks[[dataset_name]] %in% colnames(d$sample_data))) { stop(paste0("`blocks` specified for dataset ", dataset_name, " that don't exist in its pheno data")) }
cat(paste0("dataset: ", dataset_name, "\n"))
genes_to_use_this_dataset <- intersect(genes_to_use, colnames(d$gene_expression_data))
data_obj <- list()
data_obj$status <- list(
NA_present = FALSE,
less_than_min_genes = length(genes_to_use_this_dataset) < min_genes_to_keep_dataset,
expr_dims_eq_0 = c(FALSE, FALSE),
keep = TRUE
)
data_obj$pheno <- d$sample_data
data_obj$covariates <- covariates[[dataset_name]]
data_obj$block <- blocks[[dataset_name]]
print("data_obj$status:")
print(data_obj$status)
## Checking if pheno meets the criteria --------------------------------- ##
if (!is.null(sample_filters))
data_obj[["pheno"]] <- data_obj[["pheno"]] %>% filter(!!sample_filters) %>% as.data.frame
data_obj[["pheno"]][, response_variable] <- factor(data_obj[["pheno"]][, response_variable])
print(data_obj[["pheno"]][, c("sample_name", "tissue", "disease_state", "cell_type", "sample_pathology")] %>% head)
c_n <- unname(unlist(data_obj[["pheno"]]$sample_name))
rownames(data_obj[["pheno"]]) <- c_n
print(levels(as.factor(unname(unlist(data_obj[["pheno"]][, response_variable])))))
print("response variable bit")
print(data_obj[["pheno"]] %>% filter(!!rlang::sym(response_variable) %in% c(active_levels, reference_levels)) %>% pull(!!response_variable) %>% as.character %>% unique)
data_obj$status$less_than_2_response_levels <- data_obj[["pheno"]] %>% filter(!!rlang::sym(response_variable) %in% c(active_levels, reference_levels)) %>% pull(!!response_variable) %>% as.character %>% unique %>% length < 2
print("less than 2 required levels?")
print(data_obj$status$less_than_2_response_levels)
data_obj$status$less_than_min_samples_per_level <- (function () {
tmp_dat <- data_obj[["pheno"]] %>% filter(!!rlang::sym(response_variable) %in% c(active_levels, reference_levels)) %>% pull(!!response_variable) %>% as.character
lvls <- tmp_dat %>% unique
sapply(lvls, function (lvl) {
length(tmp_dat[tmp_dat == lvl]) < min_samples_per_level
}) %>% unlist %>% unname %>% any
})()
if (data_obj$status$less_than_min_genes) {
cat(paste0("Less than ", min_genes_to_keep_dataset, " viable genes in ", dataset_name, "; eliminating from further analysis!\n"))
}
if (data_obj$status$less_than_2_response_levels) {
cat(paste0("Less than 2 levels present in the response variable! Eliminating dataset ", dataset_name, "\n"))
}
if (data_obj$status$less_than_min_samples_per_level) {
cat(paste0("Less than ", min_samples_per_level, " samples with a given response level present in the response variable! Eliminating dataset ", dataset_name, "\n"))
}
if (data_obj$status$less_than_2_response_levels | data_obj$status$less_than_min_samples_per_level | data_obj$status$less_than_min_genes) {
data_obj$status$keep <- FALSE
return(data_obj)
}
c_n <- rownames(data_obj[["pheno"]])
## ---------------------------------------------------------------------- ##
## Expression data ------------------------------------------------------ ##
d$gene_expression_data <- d$gene_expression_data[, genes_to_use_this_dataset]
data_obj$expr <- d$gene_expression_data %>% as.data.frame
if (any(is.na(data_obj[["expr"]])) | any(is.null(data_obj[["expr"]]))) {
cat("\nNA values present...\n\n")
data_obj$status$NA_present <- TRUE
if (fix_NA == "mean") {
gene_means <- lapply(data_obj[["expr"]], function (col) {
if (any(is.na(col)) | any(is.null(col)))
col %>% mean(na.rm = TRUE)
## col[!is.nan(col)] %>% unlist %>% unname %>% mean(na.rm = TRUE)
else NULL
})
names(gene_means) <- colnames(data_obj[["expr"]])
gene_means[sapply(gene_means, is.null)] <- NULL
gene_means <- unlist(gene_means)
cat(paste0("NAs need fixing in ", length(gene_means), " genes\n"))
# print(gene_means[1:10])
cat("fixing NA's: ")
for (gene_num in 1:length(gene_means)) {
gene <- names(gene_means)[gene_num]
if (gene_num %% 100 == 0)
cat(paste0(gene_num, " "))
data_obj[["expr"]][is.na(data_obj[["expr"]][, gene]), gene] <- gene_means[gene]
}
print("")
cat(paste0("Checking if there are any NAs left... ", any(is.na(data_obj[["expr"]])), "\n"))
}
}
r_n <- colnames(data_obj[["expr"]])
## c_n below comes from the names of samples in sample_data
data_obj[["expr"]] <- data_obj[["expr"]][c_n, ] %>% t %>% as.data.frame
colnames(data_obj[["expr"]]) <- c_n
rownames(data_obj[["expr"]]) <- r_n
data_obj[["keys"]] <- rownames(data_obj[["expr"]])
names(data_obj[["keys"]]) <- rownames(data_obj[["expr"]])
data_obj[["dataset_name"]] <- dataset_name
c_n <- colnames(data_obj[["expr"]])
r_n <- rownames(data_obj[["expr"]])
temp <- data_obj[["expr"]] %>% t %>%
as.data.frame %>% lapply(function (col) {
as.numeric(col)
}) %>% do.call(cbind, .) %>% as.matrix %>% t
data_obj[["expr"]] <- temp
colnames(data_obj[["expr"]]) <- c_n
rownames(data_obj[["expr"]]) <- r_n
## ---------------------------------------------------------------------- ##
print(paste0("dim: ", dim(data_obj[["expr"]])))
if (0 %in% dim(data_obj[["expr"]])) {
data_obj$status$expr_dims_eq_0 <- dim(data_obj[["expr"]]) == 0
data_obj$status$keep <- FALSE
}
return(data_obj)
})
names(dge_data) <- dataset_names
dge_objects <- list(
info = dge_info,
dataset_prep_info = dge_data %>% lapply(function (x) x$status)
)
names(dge_objects$dataset_prep_info) <- dataset_names
dge_data[sapply(dge_data, function (x) {x$status$keep == FALSE})] <- NULL # Making sure that 0-length objects aren't retained
dge_objects$data <- dge_data
return(dge_objects)
}
check_ge_obj <- function (ge_obj) {}
run_dge <- function (ge_objects, technology = NULL) {
#' Here using limma for #1.
#'
#' @param `technology` needs to be a vector of "microarray" or "RNA-seq" values corresponding to the technologies in the corresponding members of `ge_objects`. If left at NULL, all datasets in `ge_objects` are assumed to be from microarrays.
#'
#' *Could be passing in a concrete contrast argument instead with a more
#' complex contrast once needed (when/if implementing, remember that these
#' will need to be supplied with `make.names`'d variable names) - for now
#' working with simple contrasts, so don't need to implement this.
#' *This expression should involve all levels from the "active_levels" and
#' "reference_levels" lists.
#'
#' Remember to pre-merge any synonymous levels before using OMA!
if (!is.list(ge_objects)) { stop("`ge_objects` needs to be a list") }
# lapply(ge_objects, check_ge_obj)
if (!is.null(technology) & !all(technology %in% c("microarray", "RNA-seq"))) { stop("`technology` specified not available!") }
if (!is.null(technology) & (length(technology) != length(ge_objects))) { stop("`technology` needs to be specified for each of `ge_objects`!") }
if (is.null(technology))
technology <- rep("microarray", length(ge_objects$data))
ge_info <- ge_objects$info
contrast_info <- ge_info$contrast
ge_results <- lapply(1:length(ge_objects$data), function (d_num) {
active_nicename <- make.names(contrast_info[["active"]])
reference_nicename <- make.names(contrast_info[["reference"]])
contrast_nicename <- paste0(active_nicename, "-", reference_nicename)
tech <- technology[d_num]
ge_data <- ge_objects$data[[d_num]]
ge_data$expr_ready <- ge_data$expr
if (tech == "RNA-seq") {
dlist <- edgeR::DGEList(counts = ge_data$expr_ready, group = ge_data$pheno[, contrast_info$variable])
keep <- edgeR::filterByExpr(dlist, min.count = 0, min.total.count = 10*ncol(ge_data$expr_ready))
dlist <- dlist[keep, , keep.list.sizes = T]
tmm <- edgeR::calcNormFactors(dlist, method = "TMM")
ge_data$expr_ready <- tmm
}
ge_f <- paste0('~ 0 + ge_data[["pheno"]][, "', contrast_info$variable, '"]')
if (!is.null(ge_data$covariates))
ge_f <- paste0(ge_f, " + ", sapply(ge_data$covariates, function (x) paste0('ge_data$pheno[, "', x, '"]')), collapse = " + ")
ge_f <- as.formula(ge_f)
# print(ge_data[["pheno"]][, contrast_info$variable])
# print(contrast_info)
# print(ge_data[["pheno"]][, "gender"])
mm <- model.matrix(ge_f)
colnames(mm)[1:length(levels(ge_data[["pheno"]][, contrast_info$variable]))] <- levels(ge_data[["pheno"]][, contrast_info$variable])
colnames(mm) <- make.names(colnames(mm))
print(colnames(mm))
con <- limma::makeContrasts(contrasts = contrast_nicename, levels = mm)
if (tech == "RNA-seq")
ge_data$expr_ready <- limma::voom(ge_data$expr_ready, mm)
lmfit_partial <- purrr::safely(limma::lmFit) %>% purrr::partial(ge_data$expr_ready, design = mm)
lmfit_realized <- lmfit_partial()$result
if (!is.null(ge_data$block)) {
corfit <- limma::duplicateCorrelation(ge_data$expr_ready, mm, block = ge_data$pheno[, ge_data$block])
lmfit_partial_cor <- lmfit_partial %>% purrr::partial(block = ge_data$pheno[, ge_data$block], correlation = corfit$result$consensus.correlation)
lmfit_partial_cor_realized <- lmfit_partial_cor()
if (is.null(lmfit_partial_cor_realized$error)) {
lmfit_realized <- lmfit_partial_cor_realized$result
}
}
limma_out <- limma::eBayes(
limma::contrasts.fit(
lmfit_realized,
con
)
)
# limma::topTable(limma_out, number = Inf) %>% tibble::rownames_to_column(var = "gene") %>% head
active_n <- ge_data$expr[rownames(limma_out$coefficients), ge_data$pheno %>% filter(!!rlang::sym(contrast_info$variable) == contrast_info$active) %>% pull(sample_name)] %>% t %>% as.data.frame %>%
lapply(function (x) {
x[!is.na(x)] %>% length
}) %>% unlist %>% unname
reference_n <- ge_data$expr[rownames(limma_out$coefficients), ge_data$pheno %>% filter(!!rlang::sym(contrast_info$variable) == contrast_info$reference) %>% pull(sample_name)] %>% t %>% as.data.frame %>%
lapply(function (x) {
x[!is.na(x)] %>% length
}) %>% unlist %>% unname
return(
tibble(
gene = rownames(limma_out$coefficients),
coeff = limma_out$coefficients %>% as.vector,
p_value = limma_out$p.value %>% as.vector,
cohen_d = limma_out$coefficients[, 1] / sqrt(limma_out$s2.post),
variance = limma_out$s2.post * limma_out$stdev.unscaled[, 1]^2, # According to Gordon Smyth in https://support.bioconductor.org/p/70175/
active_n = active_n,
reference_n = reference_n
)
)
})
names(ge_results) <- names(ge_objects$data)
return(ge_results)
}
run_dgsva <- function (ge_objects, technology = NULL, gene_sets = NULL) {
#' `run_dgsva`, as in "run differential GSVA".
#' Here using gsva and limma.
#'
#' @param `technology` needs to be a vector of "microarray" or "RNA-seq" values corresponding to the technologies in the corresponding members of `ge_objects`. If left at NULL, all datasets in `ge_objects` are assumed to be from microarrays.
#'
#' *Could be passing in a concrete contrast argument instead with a more
#' complex contrast once needed (when/if implementing, remember that these
#' will need to be supplied with `make.names`'d variable names) - for now
#' working with simple contrasts, so don't need to implement this.
#' *This expression should involve all levels from the "active_levels" and
#' "reference_levels" lists.
#'
#' Remember to pre-merge any synonymous levels before using OMA!
if (!is.list(ge_objects)) { stop("`ge_objects` needs to be a list") }
# lapply(ge_objects, check_ge_obj)
if (!is.null(technology) & !all(technology %in% c("microarray", "RNA-seq"))) { stop("`technology` specified not available!") }
if (!is.null(technology) & (length(technology) != length(ge_objects))) { stop("`technology` needs to be specified for each of `ge_objects`!") }
if (is.null(technology))
technology <- rep("microarray", length(ge_objects$data))
if (is.null(gene_sets)) {
data(c2BroadSets, package = "GSVAdata")
gene_sets <- c2BroadSets
} else if (gene_sets == "REACTOME") {
data(c2BroadSets, package = "GSVAdata")
gene_sets <- c2BroadSets[grep("^REACTOME", names(c2BroadSets))]
}
ge_info <- ge_objects$info
contrast_info <- ge_info$contrast
diff_results <- lapply(1:length(ge_objects$data), function (d_num) {
active_nicename <- make.names(contrast_info[["active"]])
reference_nicename <- make.names(contrast_info[["reference"]])
contrast_nicename <- paste0(active_nicename, "-", reference_nicename)
tech <- technology[d_num]
diff_data <- ge_objects$data[[d_num]]
symbol_to_entrezid <- AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, keys = rownames(diff_data$expr), column = "ENTREZID", keytype = "SYMBOL", multiVals = "first")
diff_data$for_gsva <- tibble(entrezid = symbol_to_entrezid) %>% bind_cols(diff_data$expr %>% as_tibble) %>% drop_na(entrezid) %>% column_to_rownames("entrezid")
if (tech == "RNA-seq")
diff_data$gsva <- GSVA::gsva(diff_data$for_gsva %>% as.matrix, gene_sets, min.sz = 10, max.sz = 500, method = "gsva", rnaseq = TRUE, mx.diff = TRUE, verbose = FALSE)
else
diff_data$gsva <- GSVA::gsva(diff_data$for_gsva %>% as.matrix, gene_sets, min.sz = 10, max.sz = 500, method = "gsva", mx.diff = TRUE, verbose = FALSE)
diff_f <- paste0('~ 0 + diff_data[["pheno"]][, "', contrast_info$variable, '"]')
if (!is.null(diff_data$covariates))
diff_f <- paste0(diff_f, " + ", sapply(diff_data$covariates, function (x) paste0('diff_data$pheno[, "', x, '"]')), collapse = " + ")
diff_f <- as.formula(diff_f)
# print(diff_data[["pheno"]][, contrast_info$variable])
# print(contrast_info)
# print(diff_data[["pheno"]][, "gender"])
mm <- model.matrix(diff_f)
colnames(mm)[1:length(levels(diff_data[["pheno"]][, contrast_info$variable]))] <- levels(diff_data[["pheno"]][, contrast_info$variable])
colnames(mm) <- make.names(colnames(mm))
print(colnames(mm))
con <- limma::makeContrasts(contrasts = contrast_nicename, levels = mm)
lmfit_partial <- purrr::safely(limma::lmFit) %>% purrr::partial(diff_data$gsva, design = mm)
lmfit_realized <- lmfit_partial()$result
if (!is.null(diff_data$block)) {
corfit <- limma::duplicateCorrelation(diff_data$gsva, mm, block = diff_data$pheno[, diff_data$block])
lmfit_partial_cor <- lmfit_partial %>% purrr::partial(block = diff_data$pheno[, diff_data$block], correlation = corfit$result$consensus.correlation)
lmfit_partial_cor_realized <- lmfit_partial_cor()
if (is.null(lmfit_partial_cor_realized$error)) {
lmfit_realized <- lmfit_partial_cor_realized$result
}
}
limma_out <- limma::eBayes(
limma::contrasts.fit(
lmfit_realized,
con
)
)
# limma::topTable(limma_out, number = Inf) %>% tibble::rownames_to_column(var = "gene") %>% head
active_n <- diff_data$gsva[rownames(limma_out$coefficients), diff_data$pheno %>% filter(!!rlang::sym(contrast_info$variable) == contrast_info$active) %>% pull(sample_name)] %>% t %>% as.data.frame %>%
lapply(function (x) {
x[!is.na(x)] %>% length
}) %>% unlist %>% unname
reference_n <- diff_data$gsva[rownames(limma_out$coefficients), diff_data$pheno %>% filter(!!rlang::sym(contrast_info$variable) == contrast_info$reference) %>% pull(sample_name)] %>% t %>% as.data.frame %>%
lapply(function (x) {
x[!is.na(x)] %>% length
}) %>% unlist %>% unname
return(
tibble(
gene = rownames(limma_out$coefficients),
coeff = limma_out$coefficients %>% as.vector,
p_value = limma_out$p.value %>% as.vector,
cohen_d = limma_out$coefficients[, 1] / sqrt(limma_out$s2.post),
variance = limma_out$s2.post * limma_out$stdev.unscaled[, 1]^2, # According to Gordon Smyth in https://support.bioconductor.org/p/70175/
active_n = active_n,
reference_n = reference_n
)
)
})
names(diff_results) <- names(ge_objects$data)
return(diff_results)
}
run_ma <- function (dat_results, dat_levels = NULL, es_var = "cohen_d", variance_var = NULL, se_var = NULL, p_adj_method = c("BH", p.adjust.methods), Q_p_cutoff = 0.05, min_num_datasets_to_use_gene = 1, parallel = FALSE, n_cores = future::availableCores() - 1) {
#' `run_ma` uses metafor's `rma` and `rma.mv` for meta-analysis.
#'
#' @param `dat_results` must be a data frame with, at the minimum, a "gene"
#' column, an effect size column, and a variance column.
#' @param `dat_levels` needs to be passed if we're dealing with multilevel
#' data. If this is the case, the level has to be specified for each data-
#' set. This will trigger the calculation of multi-level fixed and random
#' effects.
#' @param `es_var` here is unrestricted, but if the results are from `run_dge`,
#' then this should be either "cohen_d", or "coeff".
#' @param `variance_var` here is unrestricted, but if the results are from
#' `run_dge`, then this should be "p_pvalue".
#' @param `se_var` supply the name of the column containing the standard
#' errors if you've standard errors at your disposal rather than variances.
#' @param `parallel` - if set to TRUE, furrr will be used instead of purrr in
#' the M-A loop
p_adj_method <- match.arg(p_adj_method)
if (!is.null(variance_var) & !is.null(se_var)) { stop("One of `variance_var` or `se_var` should be set, but not both!") }
if (is.null(variance_var) & is.null(se_var)) variance_var <- "variance"
se_or_variance <- ifelse(is.null(se_var), "variance", "se")
sevar <- ifelse(is.null(se_var), variance_var, se_var)
if (!(parallel %in% c(TRUE, FALSE))) { stop("`parallel` can only be set to TRUE or FALSE.") }
if ((n_cores %% 1) != 0) { stop("`n_cores` must be an integer number!") }
if (parallel) {
future::plan(future::multiprocess, workers = n_cores)
map <- furrr::future_map
} else {
map <- lapply
}
if (!is.null(dat_levels) & (length(dat_levels) != length(dat_results))) { stop("The length of `dat_levels` must be equal to the number of datasets in `dat_results`.") }
ma_info <- list(
run_date = date(),
es_variable = es_var,
se_or_variance = se_or_variance,
variance_variable = variance_var,
se_variable = se_var,
p_adj_method = p_adj_method,
Q_p_cutoff = Q_p_cutoff,
min_num_datasets_to_use_gene = min_num_datasets_to_use_gene
)
if (!is.null(names(dat_results))) {
dat_names <- 1:length(dat_results) %>% as.character
names(dat_results) <- dat_names
} else {
dat_names <- names(dat_results)
}
dat_levels_df <- tibble(dataset = dat_names) %>% mutate(dat_level = ifelse(is.null(dat_levels), rep(NA, length(dat_names)), dat_levels))
genes_across <- dat_results %>% lapply(function (x) x$gene) %>% purrr::reduce(union)
genewise_es <- c(tibble(gene = genes_across) %>% list, dat_results %>% map(function (x) x %>% select(gene, !!rlang::sym(es_var)))) %>% purrr::reduce(left_join, by = "gene")
colnames(genewise_es) <- c("gene", names(dat_results))
# genewise_es_n <- genewise_es %>% tidyr::gather("key", "es", -gene) %>% select(-key) %>% tidyr::drop_na() %>% group_by(gene) %>% summarize(n = n()) %>% ungroup
genewise_sevar <- c(tibble(gene = genes_across) %>% list, dat_results %>% map(function (x) x %>% select(gene, !!rlang::sym(sevar)))) %>% purrr::reduce(left_join, by = "gene")
colnames(genewise_sevar) <- c("gene", names(dat_results))
# genewise_var_n <- genewise_var %>% tidyr::gather("key", "var", -gene) %>% select(-key) %>% tidyr::drop_na() %>% group_by(gene) %>% summarize(n = n()) %>% ungroup
if (!all(genewise_es$gene == genewise_sevar$gene))
stop("All genes and all numbers of datasets have to be equal in the ES and variance outputs!")
ma_ready <- genewise_es %>% tidyr::gather("dataset", "es", -gene) %>% left_join(genewise_sevar %>% tidyr::gather("dataset", "sevar", -gene), by = c("gene", "dataset")) %>% tidyr::drop_na() %>% left_join(dat_levels_df, by = "dataset") %>% group_split(gene)
names(ma_ready) <- sapply(ma_ready, function (x) x[1, "gene"])
ma_start_time <- Sys.time()
if (is.null(dat_levels)) {
ma_results <- map(ma_ready, function (ma_set) {
list(
ma_fixed = metafor::rma(yi = ma_set$es, vi = ifelse(se_or_variance == "variance", ma_set$sevar, ma_set$sevar^2), method = "FE"),
ma_random = metafor::rma(yi = ma_set$es, vi = ifelse(se_or_variance == "variance", ma_set$sevar, ma_set$sevar^2), method = "DL")
)
})
} else {
ma_results <- map(ma_ready, function (ma_set) {
list(
ma_fixed = metafor::rma.mv(yi = ma_set$es, vi = ifelse(se_or_variance == "variance", ma_set$sevar, ma_set$sevar^2), random = ~ 1 | dat_level, dat = ma_set, method = "DL"),
ma_random = metafor::rma.mv(yi = ma_set$es, vi = ifelse(se_or_variance == "variance", ma_set$sevar, ma_set$sevar^2), random = ~ 1 | dat_level/dataset, dat = ma_set, method = "DL")
)
})
}
ma_done_time <- Sys.time()
ma_time_taken <- ma_done_time - ma_start_time
cat("\nPure meta-analysis took:\n")
print(ma_time_taken)
ma_object <- tibble(gene = names(ma_ready)) %>% bind_cols(
tibble(n_datasets = sapply(ma_ready, nrow), datasets = sapply(ma_ready, function (x) x$dataset %>% list))
) %>% bind_cols(
ma_results %>% map(function (x) {
tibble(
fixed_es = x$ma_fixed$beta[1], fixed_se = x$ma_fixed$se, fixed_ci_lb = x$ma_fixed$ci.lb, fixed_ci_ub = x$ma_fixed$ci.ub, fixed_pval = x$ma_fixed$pval,
random_es = x$ma_random$beta[1], random_se = x$ma_random$se, random_ci_lb = x$ma_random$ci.lb, random_ci_ub = x$ma_random$ci.ub, random_pval = x$ma_random$pval, tau2 = x$ma_random$tau2, QEp = x$ma_random$QEp
)
}) %>% bind_rows
) %>% filter(n_datasets >= min_num_datasets_to_use_gene) %>% mutate(
fixed_adj_pval = p.adjust(fixed_pval, method = p_adj_method),
random_adj_pval = p.adjust(random_pval, method = p_adj_method),
hybrid_adj_pval = ifelse(QEp < Q_p_cutoff, random_pval, fixed_pval) %>% p.adjust(method = p_adj_method),
hybrid_es = ifelse(QEp < Q_p_cutoff, random_es, fixed_es),
hybrid_se = ifelse(QEp < Q_p_cutoff, random_se, fixed_se)
)
filtered_adj_p <- min_num_datasets_to_use_gene:max(ma_object$n_datasets) %>% lapply(function (cutoff) {
ma_object %>% filter(n_datasets >= cutoff) %>% select(gene, fixed_pval, random_pval, QEp) %>%
transmute(
gene,
fixed_adj_pval = p.adjust(fixed_pval, method = p_adj_method),
random_adj_pval = p.adjust(random_pval, method = p_adj_method),
hybrid_adj_pval = ifelse(QEp <= Q_p_cutoff, random_pval, fixed_pval) %>% p.adjust(method = p_adj_method)
)
})
filtered_adj_p <- list("fixed_adj_pval", "random_adj_pval", "hybrid_adj_pval") %>% lapply(function (pval) {
filtered_adj_p %>% purrr::reduce(function (x, y) {left_join(x, y %>% select(gene, !!rlang::sym(pval)), by = "gene")}, .init = ma_object %>% select(gene))
})
names(filtered_adj_p) <- c("fixed_adj_pval", "random_adj_pval", "hybrid_adj_pval")
return(
list(
ma = ma_object,
info = ma_info,
adj_pval = filtered_adj_p,
adj_pval_cutoffs = min_num_datasets_to_use_gene:max(ma_object$n_datasets)
)
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.