R/meta_analysis.R

#' @import dplyr
#' @export run_dge
#' @export run_dgsva
#' @export run_ma
#' @export prepare_for_dge



prepare_for_dge <- function (datasets, dataset_names, contrast, covariates = NULL, scrutinize_covariates = FALSE, blocks = NULL, scrutinize_blocks = FALSE, sample_filters = NULL, min_num_datasets_to_use_gene = 1, min_genes_to_keep_dataset = 5, min_samples_per_level = 2, fix_NA = c("none", "mean")) {

  #' Remember to pre-merge any synonymous contrast levels before using OMA!
  #'
  #' datasets - this needs to ba a list of dataset objects.
  #' dataset_names - this needs to ba a vector of dataset names - same order as datasets above.
  #' contrast - a three-member list, where the first filed is the name of the variable to use from "pheno", second is
  #'   the name of the active level in the contrast, and the third is the name of the reference level.
  #' sample_filters - a quosure of filters; previously this was a string that would be passed into dplyr's `filter_()`;
  #'   however, `filter_()` was deprecated, and this was quite error-prone anyway, as it required a double qotations in
  #'   and similar risky business in the case of strings. So now a full expression is expected that goes directly into
  #'   dplyr's `filter()` - send it in inside `rlang::quo()` which will make it a quosure.
  #' response_variable - informs which variable is the one to use as a contrast


  if (!is.null(sample_filters) & !rlang::is_quosure(sample_filters)) { stop("`sample_filters` needs to be a quosure!") }
  if (!is.list(contrast) | (length(contrast) != 3) | !all(names(contrast) == c("variable", "active", "reference"))) { stop("`contrast` needs to be a 3-value list, with the following properties: \"variable\" (the name of the contrast variable in the data), \"active\" (the active level(s) of the contrast), \"reference\" (reference level(s) of the contrast).") }
  if (!is.null(covariates) & !is.list(covariates) & !is.vector(covariates)) { stop("`covariates` needs to be a list with per-dataset covariates, or a vector of covariates that will be used across all datasets!") }
  if (is.list(covariates) & !all(names(covariates) %in% dataset_names)) { stop("`covariates` needs to be a list with names corresponding to dataset names!") }
  if (!is.null(blocks) & !is.list(blocks) & !is.vector(blocks)) { stop("`blocks` needs to be a list with per-dataset blocks, or a vector of blocks that will be used across all datasets!") }
  if (is.list(blocks) & !all(names(blocks) %in% dataset_names)) { stop("`blocks` needs to be a list with names corresponding to dataset names!") }
  if (is.list(blocks) & !all(sapply(blocks, function (x) length(x) == 1))) { stop("`blocks` needs to be a list with only ONE variable name per dataset!") }
  if(is.vector(blocks) & (length(blocks) != 1)) { stop("If `blocks` is a vector, than it needs to be a vector of length 1 (only a single variable can be used to identify duplicates).") }

  lapply(names(datasets), function (dat_nam) {
    dat <- datasets[[dat_nam]]
    if (!all(names(dat) %in% c("gene_expression_data", "sample_data"))) { stop(paste0("Each dataset needs to be a list with two dataframes, one for gene expression (a named list element called \"gene_expression_data\"), and sample data (another named list element called \"sample_data\"). This is not the case in ", dat_nam)) }
    if (!is.data.frame(dat$gene_expression_data) & !is.matrix(dat$gene_expression_data)) { stop(paste0("A \"gene_expression_data\" field is present in ", dat_nam, ", but it's not a data frame or matrix.")) }
    if (!is.data.frame(dat$sample_data)) { stop(paste0("A \"sample_data\" field is present in ", dat_nam, ", but it's not a data frame.")) }
    if (!("sample_name" %in% colnames(dat$sample_data))) { stop(paste0("The \"sample_data\" dataframe needs to have a \"sample_name\" column corresponding to the rownames in the expression data. This column wasn't found in ", dat_nam, "!")) }
  })


  ## testing
  # contrast <- list(
  #   variable = "disease_state",
  #   active = "atopic dermatitis",
  #   reference = "normal control"
  # )


  response_variable <- contrast[["variable"]]
  active_levels <- contrast[["active"]]
  reference_levels <- contrast[["reference"]]


  fix_NA <- match.arg(fix_NA)


  genes_to_use <- sapply(datasets, function(x) colnames(x$gene_expression_data)) %>% unlist %>% unname %>% table
  genes_to_use <- names(genes_to_use[genes_to_use >= min_num_datasets_to_use_gene])

  ## testing
  # covariates <- "gender"
  if (is.vector(covariates)) {
    covariates <- rep(list(covariates), length(dataset_names))
    names(covariates) <- dataset_names
  }

  if (!is.null(covariates) & scrutinize_covariates) {
      covariates <- lapply(1:length(datasets), function (d_num) {
        remaining_covariates <- intersect(covariates[[d_num]], colnames(datasets[[d_num]]$sample_data))
        if (length(remaining_covariates) > 0)
          remaining_covariates <- remaining_covariates[remaining_covariates %>% sapply(function (x) datasets[[d_num]]$sample_data %>% filter(!!rlang::sym(response_variable) %in% c(active_levels, reference_levels)) %>% pull(x) %>% as.character %>% unique %>% length >= 2)]
        if (length(remaining_covariates) == 0)
          remaining_covariates <- NULL
        return(remaining_covariates)
      })
      names(covariates) <- dataset_names
  }


  if (is.vector(blocks)) {
    blocks <- rep(list(blocks), length(dataset_names))
    names(blocks) <- dataset_names
  }

  if (!is.null(blocks) & scrutinize_blocks) {
      blocks <- lapply(1:length(datasets), function (d_num) {
        remaining_blocks <- intersect(blocks[[d_num]], colnames(datasets[[d_num]]$sample_data))
        if (length(remaining_blocks) > 0)
          remaining_blocks <- remaining_blocks[remaining_blocks %>% sapply(function (x) datasets[[d_num]]$sample_data %>% filter(!!rlang::sym(response_variable) %in% c(active_levels, reference_levels)) %>% pull(x) %>% as.character %>% unique %>% length >= 2)]
        if (length(remaining_blocks) == 0)
          remaining_blocks <- NULL
        return(remaining_blocks)
      })
      names(blocks) <- dataset_names
  }


  dge_info <- list(
    run_date = date(),
    min_num_datasets_to_use_gene = min_num_datasets_to_use_gene,
    min_genes_to_keep_dataset = min_genes_to_keep_dataset,
    min_samples_per_level = min_samples_per_level,
    contrast = contrast,
    scrutinize_covariates = scrutinize_covariates,
    covariates = covariates,
    scrutinize_blocks = scrutinize_blocks,
    blocks = blocks,
    fix_NA = fix_NA,
    input_datasets = dataset_names
  )


  dge_data <- lapply(1:length(datasets), function (d_num) {

    dataset_name <- dataset_names[d_num]
    d <- datasets[[d_num]]


    if (!all(covariates[[dataset_name]] %in% colnames(d$sample_data))) { stop(paste0("Covariates specified for dataset ", dataset_name, " that don't exist in its pheno data")) }
    if (!all(blocks[[dataset_name]] %in% colnames(d$sample_data))) { stop(paste0("`blocks` specified for dataset ", dataset_name, " that don't exist in its pheno data")) }

    cat(paste0("dataset: ", dataset_name, "\n"))


    genes_to_use_this_dataset <- intersect(genes_to_use, colnames(d$gene_expression_data))


    data_obj <- list()
    data_obj$status <- list(
      NA_present = FALSE,
      less_than_min_genes = length(genes_to_use_this_dataset) < min_genes_to_keep_dataset,
      expr_dims_eq_0 = c(FALSE, FALSE),
      keep = TRUE
    )
    data_obj$pheno <- d$sample_data
    data_obj$covariates <- covariates[[dataset_name]]
    data_obj$block <- blocks[[dataset_name]]

    print("data_obj$status:")
    print(data_obj$status)


    ## Checking if pheno meets the criteria --------------------------------- ##

    if (!is.null(sample_filters))
      data_obj[["pheno"]] <- data_obj[["pheno"]] %>% filter(!!sample_filters) %>% as.data.frame

    data_obj[["pheno"]][, response_variable] <- factor(data_obj[["pheno"]][, response_variable])


    print(data_obj[["pheno"]][, c("sample_name", "tissue", "disease_state", "cell_type", "sample_pathology")] %>% head)


    c_n <- unname(unlist(data_obj[["pheno"]]$sample_name))
    rownames(data_obj[["pheno"]]) <- c_n

    print(levels(as.factor(unname(unlist(data_obj[["pheno"]][, response_variable])))))

    print("response variable bit")
    print(data_obj[["pheno"]] %>% filter(!!rlang::sym(response_variable) %in% c(active_levels, reference_levels)) %>% pull(!!response_variable) %>% as.character %>% unique)

    data_obj$status$less_than_2_response_levels <- data_obj[["pheno"]] %>% filter(!!rlang::sym(response_variable) %in% c(active_levels, reference_levels)) %>% pull(!!response_variable) %>% as.character %>% unique %>% length < 2
    print("less than 2 required levels?")
    print(data_obj$status$less_than_2_response_levels)

    data_obj$status$less_than_min_samples_per_level <- (function () {
      tmp_dat <- data_obj[["pheno"]] %>% filter(!!rlang::sym(response_variable) %in% c(active_levels, reference_levels)) %>% pull(!!response_variable) %>% as.character
      lvls <- tmp_dat %>% unique
      sapply(lvls, function (lvl) {
        length(tmp_dat[tmp_dat == lvl]) < min_samples_per_level
      }) %>% unlist %>% unname %>% any
    })()

    if (data_obj$status$less_than_min_genes) {
      cat(paste0("Less than ", min_genes_to_keep_dataset, " viable genes in ", dataset_name, "; eliminating from further analysis!\n"))
    }
    if (data_obj$status$less_than_2_response_levels) {
      cat(paste0("Less than 2 levels present in the response variable! Eliminating dataset ", dataset_name, "\n"))
    }
    if (data_obj$status$less_than_min_samples_per_level) {
      cat(paste0("Less than ", min_samples_per_level, " samples with a given response level present in the response variable! Eliminating dataset ", dataset_name, "\n"))
    }
    if (data_obj$status$less_than_2_response_levels | data_obj$status$less_than_min_samples_per_level | data_obj$status$less_than_min_genes) {
      data_obj$status$keep <- FALSE
      return(data_obj)
    }


    c_n <- rownames(data_obj[["pheno"]])

    ## ---------------------------------------------------------------------- ##


    ## Expression data ------------------------------------------------------ ##

    d$gene_expression_data <- d$gene_expression_data[, genes_to_use_this_dataset]
    data_obj$expr <- d$gene_expression_data %>% as.data.frame


    if (any(is.na(data_obj[["expr"]])) | any(is.null(data_obj[["expr"]]))) {

      cat("\nNA values present...\n\n")
      data_obj$status$NA_present <- TRUE

      if (fix_NA == "mean") {

        gene_means <- lapply(data_obj[["expr"]], function (col) {
          if (any(is.na(col)) | any(is.null(col)))
            col %>% mean(na.rm = TRUE)
          ## col[!is.nan(col)] %>% unlist %>% unname %>% mean(na.rm = TRUE)
          else NULL
        })
        names(gene_means) <- colnames(data_obj[["expr"]])
        gene_means[sapply(gene_means, is.null)] <- NULL
        gene_means <- unlist(gene_means)

        cat(paste0("NAs need fixing in ", length(gene_means), " genes\n"))
        # print(gene_means[1:10])

        cat("fixing NA's: ")
        for (gene_num in 1:length(gene_means)) {
          gene <- names(gene_means)[gene_num]
          if (gene_num %% 100 == 0)
            cat(paste0(gene_num, " "))
          data_obj[["expr"]][is.na(data_obj[["expr"]][, gene]), gene] <- gene_means[gene]
        }
        print("")

        cat(paste0("Checking if there are any NAs left... ", any(is.na(data_obj[["expr"]])), "\n"))
      }
    }


    r_n <- colnames(data_obj[["expr"]])


    ## c_n below comes from the names of samples in sample_data
    data_obj[["expr"]] <- data_obj[["expr"]][c_n, ] %>% t %>% as.data.frame

    colnames(data_obj[["expr"]]) <- c_n
    rownames(data_obj[["expr"]]) <- r_n


    data_obj[["keys"]] <- rownames(data_obj[["expr"]])
    names(data_obj[["keys"]]) <- rownames(data_obj[["expr"]])
    data_obj[["dataset_name"]] <- dataset_name


    c_n <- colnames(data_obj[["expr"]])
    r_n <- rownames(data_obj[["expr"]])
    temp <- data_obj[["expr"]] %>% t %>%
      as.data.frame %>% lapply(function (col) {
        as.numeric(col)
      }) %>% do.call(cbind, .) %>% as.matrix %>% t
    data_obj[["expr"]] <- temp
    colnames(data_obj[["expr"]]) <- c_n
    rownames(data_obj[["expr"]]) <- r_n

    ## ---------------------------------------------------------------------- ##


    print(paste0("dim: ", dim(data_obj[["expr"]])))

    if (0 %in% dim(data_obj[["expr"]])) {
      data_obj$status$expr_dims_eq_0 <- dim(data_obj[["expr"]]) == 0
      data_obj$status$keep <- FALSE
    }

    return(data_obj)

  })

  names(dge_data) <- dataset_names
  dge_objects <- list(
    info = dge_info,
    dataset_prep_info = dge_data %>% lapply(function (x) x$status)
  )
  names(dge_objects$dataset_prep_info) <- dataset_names
  dge_data[sapply(dge_data, function (x) {x$status$keep == FALSE})] <- NULL # Making sure that 0-length objects aren't retained
  dge_objects$data <- dge_data

  return(dge_objects)

}



check_ge_obj <- function (ge_obj) {}



run_dge <- function (ge_objects, technology = NULL) {

  #' Here using limma for #1.
  #'
  #' @param `technology` needs to be a vector of "microarray" or "RNA-seq" values corresponding to the technologies in the corresponding members of `ge_objects`. If left at NULL, all datasets in `ge_objects` are assumed to be from microarrays.
  #'
  #' *Could be passing in a concrete contrast argument instead with a more
  #' complex contrast once needed (when/if implementing, remember that these
  #' will need to be supplied with `make.names`'d variable names) - for now
  #' working with simple contrasts, so don't need to implement this.
  #' *This expression should involve all levels from the "active_levels" and
  #' "reference_levels" lists.
  #'
  #' Remember to pre-merge any synonymous levels before using OMA!


  if (!is.list(ge_objects)) { stop("`ge_objects` needs to be a list") }
  # lapply(ge_objects, check_ge_obj)

  if (!is.null(technology) & !all(technology %in% c("microarray", "RNA-seq"))) { stop("`technology` specified not available!") }
  if (!is.null(technology) & (length(technology) != length(ge_objects))) { stop("`technology` needs to be specified for each of `ge_objects`!") }

  if (is.null(technology))
    technology <- rep("microarray", length(ge_objects$data))


  ge_info <- ge_objects$info
  contrast_info <- ge_info$contrast


  ge_results <- lapply(1:length(ge_objects$data), function (d_num) {

    active_nicename <- make.names(contrast_info[["active"]])
    reference_nicename <- make.names(contrast_info[["reference"]])
    contrast_nicename <- paste0(active_nicename, "-", reference_nicename)

    tech <- technology[d_num]

    ge_data <- ge_objects$data[[d_num]]

    ge_data$expr_ready <- ge_data$expr


    if (tech == "RNA-seq") {
      dlist <- edgeR::DGEList(counts = ge_data$expr_ready, group = ge_data$pheno[, contrast_info$variable])
      keep <- edgeR::filterByExpr(dlist, min.count = 0, min.total.count = 10*ncol(ge_data$expr_ready))
      dlist <- dlist[keep, , keep.list.sizes = T]
      tmm <- edgeR::calcNormFactors(dlist, method = "TMM")
      ge_data$expr_ready <- tmm
    }

    ge_f <- paste0('~ 0 + ge_data[["pheno"]][, "', contrast_info$variable, '"]')
    if (!is.null(ge_data$covariates))
      ge_f <- paste0(ge_f, " + ", sapply(ge_data$covariates, function (x) paste0('ge_data$pheno[, "', x, '"]')), collapse = " + ")
    ge_f <- as.formula(ge_f)

    # print(ge_data[["pheno"]][, contrast_info$variable])
    # print(contrast_info)
    # print(ge_data[["pheno"]][, "gender"])

    mm <- model.matrix(ge_f)
    colnames(mm)[1:length(levels(ge_data[["pheno"]][, contrast_info$variable]))] <- levels(ge_data[["pheno"]][, contrast_info$variable])
    colnames(mm) <- make.names(colnames(mm))

    print(colnames(mm))

    con <- limma::makeContrasts(contrasts = contrast_nicename, levels = mm)


    if (tech == "RNA-seq")
      ge_data$expr_ready <- limma::voom(ge_data$expr_ready, mm)


    lmfit_partial <- purrr::safely(limma::lmFit) %>% purrr::partial(ge_data$expr_ready, design = mm)
    lmfit_realized <- lmfit_partial()$result

    if (!is.null(ge_data$block)) {
      corfit <- limma::duplicateCorrelation(ge_data$expr_ready, mm, block = ge_data$pheno[, ge_data$block])
      lmfit_partial_cor <- lmfit_partial %>% purrr::partial(block = ge_data$pheno[, ge_data$block], correlation = corfit$result$consensus.correlation)
      lmfit_partial_cor_realized <- lmfit_partial_cor()
      if (is.null(lmfit_partial_cor_realized$error)) {
        lmfit_realized <- lmfit_partial_cor_realized$result
      }
    }

    limma_out <- limma::eBayes(
      limma::contrasts.fit(
        lmfit_realized,
        con
      )
    )

    # limma::topTable(limma_out, number = Inf) %>% tibble::rownames_to_column(var = "gene") %>% head


    active_n <- ge_data$expr[rownames(limma_out$coefficients), ge_data$pheno %>% filter(!!rlang::sym(contrast_info$variable) == contrast_info$active) %>% pull(sample_name)] %>% t %>% as.data.frame %>%
      lapply(function (x) {
        x[!is.na(x)] %>% length
      }) %>% unlist %>% unname

    reference_n <- ge_data$expr[rownames(limma_out$coefficients), ge_data$pheno %>% filter(!!rlang::sym(contrast_info$variable) == contrast_info$reference) %>% pull(sample_name)] %>% t %>% as.data.frame %>%
      lapply(function (x) {
        x[!is.na(x)] %>% length
      }) %>% unlist %>% unname


    return(
      tibble(
        gene = rownames(limma_out$coefficients),
        coeff = limma_out$coefficients %>% as.vector,
        p_value = limma_out$p.value %>% as.vector,
        cohen_d = limma_out$coefficients[, 1] / sqrt(limma_out$s2.post),
        variance = limma_out$s2.post * limma_out$stdev.unscaled[, 1]^2, # According to Gordon Smyth in https://support.bioconductor.org/p/70175/
        active_n = active_n,
        reference_n = reference_n
      )
    )

  })
  names(ge_results) <- names(ge_objects$data)

  return(ge_results)
}



run_dgsva <- function (ge_objects, technology = NULL, gene_sets = NULL) {

  #' `run_dgsva`, as in "run differential GSVA".
  #' Here using gsva and limma.
  #'
  #' @param `technology` needs to be a vector of "microarray" or "RNA-seq" values corresponding to the technologies in the corresponding members of `ge_objects`. If left at NULL, all datasets in `ge_objects` are assumed to be from microarrays.
  #'
  #' *Could be passing in a concrete contrast argument instead with a more
  #' complex contrast once needed (when/if implementing, remember that these
  #' will need to be supplied with `make.names`'d variable names) - for now
  #' working with simple contrasts, so don't need to implement this.
  #' *This expression should involve all levels from the "active_levels" and
  #' "reference_levels" lists.
  #'
  #' Remember to pre-merge any synonymous levels before using OMA!


  if (!is.list(ge_objects)) { stop("`ge_objects` needs to be a list") }
  # lapply(ge_objects, check_ge_obj)

  if (!is.null(technology) & !all(technology %in% c("microarray", "RNA-seq"))) { stop("`technology` specified not available!") }
  if (!is.null(technology) & (length(technology) != length(ge_objects))) { stop("`technology` needs to be specified for each of `ge_objects`!") }

  if (is.null(technology))
    technology <- rep("microarray", length(ge_objects$data))


  if (is.null(gene_sets)) {
    data(c2BroadSets, package = "GSVAdata")
    gene_sets <- c2BroadSets
  } else if (gene_sets == "REACTOME") {
    data(c2BroadSets, package = "GSVAdata")
    gene_sets <- c2BroadSets[grep("^REACTOME", names(c2BroadSets))]
  }

  ge_info <- ge_objects$info
  contrast_info <- ge_info$contrast


  diff_results <- lapply(1:length(ge_objects$data), function (d_num) {

    active_nicename <- make.names(contrast_info[["active"]])
    reference_nicename <- make.names(contrast_info[["reference"]])
    contrast_nicename <- paste0(active_nicename, "-", reference_nicename)

    tech <- technology[d_num]

    diff_data <- ge_objects$data[[d_num]]


    symbol_to_entrezid <- AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, keys = rownames(diff_data$expr), column = "ENTREZID", keytype = "SYMBOL", multiVals = "first")
    diff_data$for_gsva <- tibble(entrezid = symbol_to_entrezid) %>% bind_cols(diff_data$expr %>% as_tibble) %>% drop_na(entrezid) %>% column_to_rownames("entrezid")


    if (tech == "RNA-seq")
      diff_data$gsva <- GSVA::gsva(diff_data$for_gsva %>% as.matrix, gene_sets, min.sz = 10, max.sz = 500, method = "gsva", rnaseq = TRUE, mx.diff = TRUE, verbose = FALSE)
    else
      diff_data$gsva <- GSVA::gsva(diff_data$for_gsva %>% as.matrix, gene_sets, min.sz = 10, max.sz = 500, method = "gsva", mx.diff = TRUE, verbose = FALSE)


    diff_f <- paste0('~ 0 + diff_data[["pheno"]][, "', contrast_info$variable, '"]')
    if (!is.null(diff_data$covariates))
      diff_f <- paste0(diff_f, " + ", sapply(diff_data$covariates, function (x) paste0('diff_data$pheno[, "', x, '"]')), collapse = " + ")
    diff_f <- as.formula(diff_f)

    # print(diff_data[["pheno"]][, contrast_info$variable])
    # print(contrast_info)
    # print(diff_data[["pheno"]][, "gender"])

    mm <- model.matrix(diff_f)
    colnames(mm)[1:length(levels(diff_data[["pheno"]][, contrast_info$variable]))] <- levels(diff_data[["pheno"]][, contrast_info$variable])
    colnames(mm) <- make.names(colnames(mm))

    print(colnames(mm))

    con <- limma::makeContrasts(contrasts = contrast_nicename, levels = mm)


    lmfit_partial <- purrr::safely(limma::lmFit) %>% purrr::partial(diff_data$gsva, design = mm)
    lmfit_realized <- lmfit_partial()$result

    if (!is.null(diff_data$block)) {
      corfit <- limma::duplicateCorrelation(diff_data$gsva, mm, block = diff_data$pheno[, diff_data$block])
      lmfit_partial_cor <- lmfit_partial %>% purrr::partial(block = diff_data$pheno[, diff_data$block], correlation = corfit$result$consensus.correlation)
      lmfit_partial_cor_realized <- lmfit_partial_cor()
      if (is.null(lmfit_partial_cor_realized$error)) {
        lmfit_realized <- lmfit_partial_cor_realized$result
      }
    }

    limma_out <- limma::eBayes(
      limma::contrasts.fit(
        lmfit_realized,
        con
      )
    )

    # limma::topTable(limma_out, number = Inf) %>% tibble::rownames_to_column(var = "gene") %>% head


    active_n <- diff_data$gsva[rownames(limma_out$coefficients), diff_data$pheno %>% filter(!!rlang::sym(contrast_info$variable) == contrast_info$active) %>% pull(sample_name)] %>% t %>% as.data.frame %>%
      lapply(function (x) {
        x[!is.na(x)] %>% length
      }) %>% unlist %>% unname

    reference_n <- diff_data$gsva[rownames(limma_out$coefficients), diff_data$pheno %>% filter(!!rlang::sym(contrast_info$variable) == contrast_info$reference) %>% pull(sample_name)] %>% t %>% as.data.frame %>%
      lapply(function (x) {
        x[!is.na(x)] %>% length
      }) %>% unlist %>% unname


    return(
      tibble(
        gene = rownames(limma_out$coefficients),
        coeff = limma_out$coefficients %>% as.vector,
        p_value = limma_out$p.value %>% as.vector,
        cohen_d = limma_out$coefficients[, 1] / sqrt(limma_out$s2.post),
        variance = limma_out$s2.post * limma_out$stdev.unscaled[, 1]^2, # According to Gordon Smyth in https://support.bioconductor.org/p/70175/
        active_n = active_n,
        reference_n = reference_n
      )
    )

  })
  names(diff_results) <- names(ge_objects$data)

  return(diff_results)
}



run_ma <- function (dat_results, dat_levels = NULL, es_var = "cohen_d", variance_var = NULL, se_var = NULL, p_adj_method = c("BH", p.adjust.methods), Q_p_cutoff = 0.05, min_num_datasets_to_use_gene = 1, parallel = FALSE, n_cores = future::availableCores() - 1) {

  #' `run_ma` uses metafor's `rma` and `rma.mv` for meta-analysis.
  #'
  #' @param `dat_results` must be a data frame with, at the minimum, a "gene"
  #'   column, an effect size column, and a variance column.
  #' @param `dat_levels` needs to be passed if we're dealing with multilevel
  #'   data. If this is the case, the level has to be specified for each data-
  #'   set. This will trigger the calculation of multi-level fixed and random
  #'   effects.
  #' @param `es_var` here is unrestricted, but if the results are from `run_dge`,
  #'   then this should be either "cohen_d", or "coeff".
  #' @param `variance_var` here is unrestricted, but if the results are from
  #'   `run_dge`, then this should be "p_pvalue".
  #' @param `se_var` supply the name of the column containing the standard
  #'   errors if you've standard errors at your disposal rather than variances.
  #' @param `parallel` - if set to TRUE, furrr will be used instead of purrr in
  #'   the M-A loop

  p_adj_method <- match.arg(p_adj_method)


  if (!is.null(variance_var) & !is.null(se_var)) { stop("One of `variance_var` or `se_var` should be set, but not both!") }
  if (is.null(variance_var) & is.null(se_var)) variance_var <- "variance"

  se_or_variance <- ifelse(is.null(se_var), "variance", "se")
  sevar <- ifelse(is.null(se_var), variance_var, se_var)


  if (!(parallel %in% c(TRUE, FALSE))) { stop("`parallel` can only be set to TRUE or FALSE.") }
  if ((n_cores %% 1) != 0) { stop("`n_cores` must be an integer number!") }

  if (parallel) {
    future::plan(future::multiprocess, workers = n_cores)
    map <- furrr::future_map
  } else {
    map <- lapply
  }


  if (!is.null(dat_levels) & (length(dat_levels) != length(dat_results))) { stop("The length of `dat_levels` must be equal to the number of datasets in `dat_results`.") }

  ma_info <- list(
    run_date = date(),
    es_variable = es_var,
    se_or_variance = se_or_variance,
    variance_variable = variance_var,
    se_variable = se_var,
    p_adj_method = p_adj_method,
    Q_p_cutoff = Q_p_cutoff,
    min_num_datasets_to_use_gene = min_num_datasets_to_use_gene
  )


  if (!is.null(names(dat_results))) {
    dat_names <- 1:length(dat_results) %>% as.character
    names(dat_results) <- dat_names
  } else {
    dat_names <- names(dat_results)
  }


  dat_levels_df <- tibble(dataset = dat_names) %>% mutate(dat_level = ifelse(is.null(dat_levels), rep(NA, length(dat_names)), dat_levels))


  genes_across <- dat_results %>% lapply(function (x) x$gene) %>% purrr::reduce(union)

  genewise_es <- c(tibble(gene = genes_across) %>% list, dat_results %>% map(function (x) x %>% select(gene, !!rlang::sym(es_var)))) %>% purrr::reduce(left_join, by = "gene")
  colnames(genewise_es) <- c("gene", names(dat_results))
  # genewise_es_n <- genewise_es %>% tidyr::gather("key", "es", -gene) %>% select(-key) %>% tidyr::drop_na() %>% group_by(gene) %>% summarize(n = n()) %>% ungroup

  genewise_sevar <- c(tibble(gene = genes_across) %>% list, dat_results %>% map(function (x) x %>% select(gene, !!rlang::sym(sevar)))) %>% purrr::reduce(left_join, by = "gene")
  colnames(genewise_sevar) <- c("gene", names(dat_results))
  # genewise_var_n <- genewise_var %>% tidyr::gather("key", "var", -gene) %>% select(-key) %>% tidyr::drop_na() %>% group_by(gene) %>% summarize(n = n()) %>% ungroup

  if (!all(genewise_es$gene == genewise_sevar$gene))
    stop("All genes and all numbers of datasets have to be equal in the ES and variance outputs!")


  ma_ready <- genewise_es %>% tidyr::gather("dataset", "es", -gene) %>% left_join(genewise_sevar %>% tidyr::gather("dataset", "sevar", -gene), by = c("gene", "dataset")) %>% tidyr::drop_na() %>% left_join(dat_levels_df, by = "dataset") %>% group_split(gene)
  names(ma_ready) <- sapply(ma_ready, function (x) x[1, "gene"])


  ma_start_time <- Sys.time()

  if (is.null(dat_levels)) {
    ma_results <- map(ma_ready, function (ma_set) {
      list(
        ma_fixed = metafor::rma(yi = ma_set$es, vi = ifelse(se_or_variance == "variance", ma_set$sevar, ma_set$sevar^2), method = "FE"),
        ma_random = metafor::rma(yi = ma_set$es, vi = ifelse(se_or_variance == "variance", ma_set$sevar, ma_set$sevar^2), method = "DL")
      )
    })
  } else {
    ma_results <- map(ma_ready, function (ma_set) {
      list(
        ma_fixed = metafor::rma.mv(yi = ma_set$es, vi = ifelse(se_or_variance == "variance", ma_set$sevar, ma_set$sevar^2), random = ~ 1 | dat_level, dat = ma_set, method = "DL"),
        ma_random = metafor::rma.mv(yi = ma_set$es, vi = ifelse(se_or_variance == "variance", ma_set$sevar, ma_set$sevar^2), random = ~ 1 | dat_level/dataset, dat = ma_set, method = "DL")
      )
    })
  }

  ma_done_time <- Sys.time()
  ma_time_taken <- ma_done_time - ma_start_time

  cat("\nPure meta-analysis took:\n")
  print(ma_time_taken)


  ma_object <- tibble(gene = names(ma_ready)) %>% bind_cols(
    tibble(n_datasets = sapply(ma_ready, nrow), datasets = sapply(ma_ready, function (x) x$dataset %>% list))
  ) %>% bind_cols(
    ma_results %>% map(function (x) {
      tibble(
        fixed_es = x$ma_fixed$beta[1], fixed_se = x$ma_fixed$se, fixed_ci_lb = x$ma_fixed$ci.lb, fixed_ci_ub = x$ma_fixed$ci.ub, fixed_pval = x$ma_fixed$pval,
        random_es = x$ma_random$beta[1], random_se = x$ma_random$se, random_ci_lb = x$ma_random$ci.lb, random_ci_ub = x$ma_random$ci.ub, random_pval = x$ma_random$pval, tau2 = x$ma_random$tau2, QEp = x$ma_random$QEp
      )
    }) %>% bind_rows
  ) %>% filter(n_datasets >= min_num_datasets_to_use_gene) %>% mutate(
    fixed_adj_pval = p.adjust(fixed_pval, method = p_adj_method),
    random_adj_pval = p.adjust(random_pval, method = p_adj_method),
    hybrid_adj_pval = ifelse(QEp < Q_p_cutoff, random_pval, fixed_pval) %>% p.adjust(method = p_adj_method),
    hybrid_es = ifelse(QEp < Q_p_cutoff, random_es, fixed_es),
    hybrid_se = ifelse(QEp < Q_p_cutoff, random_se, fixed_se)
  )

  filtered_adj_p <- min_num_datasets_to_use_gene:max(ma_object$n_datasets) %>% lapply(function (cutoff) {
    ma_object %>% filter(n_datasets >= cutoff) %>% select(gene, fixed_pval, random_pval, QEp) %>%
      transmute(
        gene,
        fixed_adj_pval = p.adjust(fixed_pval, method = p_adj_method),
        random_adj_pval = p.adjust(random_pval, method = p_adj_method),
        hybrid_adj_pval = ifelse(QEp <= Q_p_cutoff, random_pval, fixed_pval) %>% p.adjust(method = p_adj_method)
      )
  })
  filtered_adj_p <- list("fixed_adj_pval", "random_adj_pval", "hybrid_adj_pval") %>% lapply(function (pval) {
    filtered_adj_p %>% purrr::reduce(function (x, y) {left_join(x, y %>% select(gene, !!rlang::sym(pval)), by = "gene")}, .init = ma_object %>% select(gene))
  })
  names(filtered_adj_p) <- c("fixed_adj_pval", "random_adj_pval", "hybrid_adj_pval")

  return(
    list(
      ma = ma_object,
      info = ma_info,
      adj_pval = filtered_adj_p,
      adj_pval_cutoffs = min_num_datasets_to_use_gene:max(ma_object$n_datasets)
    )
  )

}
pharmhax/OMA documentation built on Nov. 1, 2020, 3:51 a.m.