R/random_neg_ctrl_cont.R

#' Randomized negative control for count data in longdat_cont()
#' @param test_var Internal function argument.
#' @param variable_col Internal function argument.
#' @param fac_var Internal function argument.
#' @param not_used Internal function argument.
#' @param factors Internal function argument.
#' @param data Internal function argument.
#' @param N Internal function argument.
#' @param data_type Internal function argument.
#' @param variables Internal function argument.
#' @param adjustMethod Internal function argument.
#' @param model_q Internal function argument.
#' @param posthoc_q Internal function argument.
#' @param theta_cutoff Internal function argument.
#' @param nonzero_count_cutoff1 Internal function argument.
#' @param nonzero_count_cutoff2 Internal function argument.
#' @param verbose Internal function argument.
#' @importFrom rlang .data
#' @importFrom stats as.formula confint cor.test kruskal.test
#'             na.omit p.adjust wilcox.test
#' @importFrom car Anova
#' @importFrom magrittr '%>%'
#' @import reshape2
#' @import tidyr
#' @import dplyr
#' @import glmmTMB
#' @import tibble
#' @importFrom stringr str_detect
#' @name random_neg_ctrl_cont
utils::globalVariables(c("non_zero_count", "NB_theta"))

random_neg_ctrl_cont <- function(test_var, variable_col, fac_var, not_used,
                                 factors, data, N, data_type, variables,
                                 adjustMethod, model_q, posthoc_q,
                                 theta_cutoff, nonzero_count_cutoff1,
                                 nonzero_count_cutoff2, verbose) {
  ######### Randomize the raw data first
  # Shuffle the rows randomly
  value_for_random <- data[ , variable_col:ncol(data)]
  value_random <- value_for_random[sample(nrow(value_for_random)), ]
  data_randomized <- as.data.frame(cbind(data[ , 1:(variable_col-1)],
                                         value_random))
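  # Note (illustrative): only the feature columns are permuted; the predictor
  # columns keep their original row order, so any predictor-feature association
  # left in data_randomized can arise only by chance. E.g. if sample(3)
  # returned c(3, 1, 2), row 1's predictors would be paired with row 3's
  # feature values.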

  # Remove all the symbols from variables
  colnames(data_randomized)[variable_col:ncol(data_randomized)] <-
    fix_name_fun(colnames(data_randomized)[variable_col:ncol(data_randomized)])

  # Melt randomized data
  predictor_names <- colnames(data_randomized)[1:(variable_col - 1)]
  melt_data_random <- reshape2::melt(data_randomized, id = predictor_names)
  # Drop the rows whose value column is NA
  melt_data_random <- melt_data_random %>% tidyr::drop_na(.data$value)
  # Replace all dots in the bacteria names, or they will cause problems later
  melt_data_random$variable <- gsub(".", "_", melt_data_random$variable,
                                    fixed = TRUE)

  # Make sure that all the columns are of the right class:
  # columns listed in fac_var, plus the second-to-last column of the melted
  # data, are factors; all other columns, plus the last column, are numeric
  "%notin%" <- Negate("%in%")
  num_var <- c(which(1:(ncol(melt_data_random)-2) %notin% fac_var),
               ncol(melt_data_random))
  fac_var <- c(fac_var, ncol(melt_data_random)-1)
  for (i in fac_var) {
    melt_data_random[ ,i] <- as.factor(melt_data_random[ ,i])
  }
  for (i in num_var) {
    melt_data_random[ ,i] <- as.numeric(as.character(melt_data_random[ ,i]))
  }

  # Remove the not-used columns in melt_data
  if (!is.null(not_used)) {
    melt_data_random <- melt_data_random %>% dplyr::select(-c(not_used))
  }
  # Change the first column name of melt_data_random to "Individual"
  colnames(melt_data_random)[1] <- "Individual"

  ######### Then do the model test
  Ps_neg_ctrl <- as.data.frame(matrix(data = NA, nrow = N, ncol = 2))
  rownames(Ps_neg_ctrl) <- gsub(".", "_", variables, fixed = TRUE)
  colnames(Ps_neg_ctrl) <- c("Neg_ctrl_model_p", "Signal_of_CI_signs")
  Theta_random <- c()

  suppressWarnings(
    for (i in 1:N) { # loop through all variables
      aVariable <- variables[i]
      if (verbose == TRUE) {print(i)}
      subdata_random <- subset(melt_data_random, variable == aVariable)
      tryCatch({
        # Negative binomial
        fmla2 <- as.formula(paste("value ~ (1| Individual) +", test_var))
        m3 <- glmmTMB::glmmTMB(formula = fmla2, data = subdata_random,
                               family = nbinom2, na.action = na.omit,
                               REML = FALSE)

        # Extract dispersion theta out of model
        Theta_random[i] <- glmmTMB::sigma(m3)
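        # For the nbinom2 family, sigma() returns the negative binomial
        # dispersion parameter (stored here as NB_theta and filtered on
        # later via theta_cutoff)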

        # Wald Chisq test
        Ps_neg_ctrl[i, 1] <-
          car::Anova(m3, type=c("II"),  test.statistic=c("Chisq"))$"Pr(>Chisq)"

        # Calculate confidence interval for test_var
        # Followed by a determination of the signs: "- -" or "+ +" means the
        # CI does not span 0, whereas "- +" means the CI spans 0.
        # The signs are summed over each row; a non-zero sum means the CI is OK.
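        # Worked example: a CI of (-0.3, -0.1) has signs (-1, -1) and row sum
        # -2 (non-zero, so the CI excludes 0); a CI of (-0.2, 0.4) has signs
        # (-1, 1) and row sum 0 (the CI spans 0).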
        if (exists("ci")) rm(ci)  # clear any stale CI from the previous iteration
        ci <- as.data.frame(confint(m3))
        ci <- ci[str_detect(row.names(ci), test_var), 1:2]
        if (nrow(ci) > 1) {
          if (any(apply(sign(ci), 1, sum, na.rm = TRUE) != 0)) {
            Ps_neg_ctrl[i, 2] <- "Good"
          } else {
            Ps_neg_ctrl[i, 2] <- "Bad"
          }
        } else if (nrow(ci) == 1) {
          if (sign(ci)[1] == sign(ci)[2]) {
            Ps_neg_ctrl[i, 2] <- "Good"
          } else {
            Ps_neg_ctrl[i, 2] <- "Bad"
          }
        }
      }, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
    })
  Ps_neg_ctrl <- Ps_neg_ctrl %>%
    tibble::rownames_to_column() %>%
    dplyr::mutate(NB_theta = Theta_random) %>%
    tibble::column_to_rownames()

  ######### Post-hoc test and effect size
  # Spearman's correlation is used here
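  # For method = "spearman", cor.test()$estimate is Spearman's rho, reported
  # below as the effect size ("association")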
  p_poho_random <- as.data.frame(matrix(nrow = length(variables), ncol = 1))
  assoc_random <- as.data.frame(matrix(nrow = length(variables), ncol = 1))

  for (i in 1:N) { # loop through all variables
    if (verbose == TRUE) {print(i)}
    cVariable <- variables[i]
    subdata_random <- subset(melt_data_random, variable == cVariable)
    # Coerce the test_var column to numeric
    subdata_random[ , test_var] <- as.numeric(subdata_random[ , test_var])
    e <- cor.test(subdata_random[ , test_var], subdata_random$value,
                  method = "spearman")
    p_c_random <- e$p.value
    a_c_random <- e$estimate
    p_poho_random[i, 1] <- p_c_random
    assoc_random[i, 1] <- a_c_random
  }
  row.names(p_poho_random) <- variables
  row.names(assoc_random) <- variables
  colnames(p_poho_random) <- "p_post-hoc"
  colnames(assoc_random) <- "association"

  #### Remove high-theta and low-prevalence features from the randomized result
  absolute_sparsity_random <- c()
  for (i in seq_len(ncol(value_random))) {
    absolute_sparsity_random[i] <- sum(value_random[ , i] == 0, na.rm = TRUE)
  }

  non_zero_count_randomized <- nrow(data_randomized) - absolute_sparsity_random
  Ps_neg_ctrl <- Ps_neg_ctrl %>%
    tibble::rownames_to_column() %>%
    dplyr::mutate(non_zero_count = non_zero_count_randomized) %>%
    tibble::column_to_rownames()

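  # Exclusion rules applied below: a feature is excluded if it is sparse
  # (non_zero_count <= nonzero_count_cutoff1) and its dispersion estimate is
  # high (NB_theta >= theta_cutoff), or if it is very sparse
  # (non_zero_count <= nonzero_count_cutoff2) regardless of theta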
  bac_exclude_1_random <-
    subset(Ps_neg_ctrl, non_zero_count <= nonzero_count_cutoff1 &
             NB_theta >= theta_cutoff)
  bac_exclude_2_random <-
    subset(Ps_neg_ctrl, non_zero_count <= nonzero_count_cutoff2)
  bac_exclude_random <- unique(c(rownames(bac_exclude_1_random),
                                 rownames(bac_exclude_2_random)))
  bac_include_random <-
    rownames(Ps_neg_ctrl)[!rownames(Ps_neg_ctrl) %in% bac_exclude_random]

  Ps_neg_ctrl_filterd <- Ps_neg_ctrl %>%
    tibble::rownames_to_column() %>%
    dplyr::filter(.data$rowname %in% bac_include_random) %>%
    tibble::column_to_rownames()

  p_poho_random_filtered <- p_poho_random %>%
    tibble::rownames_to_column() %>%
    dplyr::filter(.data$rowname %in% bac_include_random) %>%
    tibble::column_to_rownames()

  assoc_random_filtered <- assoc_random %>%
    tibble::rownames_to_column() %>%
    dplyr::filter(.data$rowname %in% bac_include_random) %>%
    tibble::column_to_rownames()

  ####### FDR correction
  adjust_fun <- function(x) p.adjust(p = x, method = adjustMethod)
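  # For example, with adjustMethod = "fdr" (Benjamini-Hochberg),
  # adjust_fun(c(0.01, 0.02, 0.03)) returns c(0.03, 0.03, 0.03)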
  Ps_neg_ctrl_fdr <- adjust_fun(Ps_neg_ctrl_filterd$Neg_ctrl_model_p)
  Ps_neg_ctrl_filterd <- Ps_neg_ctrl_filterd %>%
    tibble::rownames_to_column() %>%
    dplyr::mutate(P_fdr = Ps_neg_ctrl_fdr) %>%
    tibble::column_to_rownames()

  ####### Build the randomized control table
  signal_neg_ctrl_tbl <-
    data.frame(matrix(nrow = length(row.names(Ps_neg_ctrl_filterd)),
                      ncol = 1, data = NA, byrow = FALSE))
  colnames(signal_neg_ctrl_tbl) <- "Signal"
  rownames(signal_neg_ctrl_tbl) <- rownames(Ps_neg_ctrl_filterd)
  for (i in seq_len(nrow(Ps_neg_ctrl_filterd))) {
    signal_neg_ctrl_tbl[i, 1] <-
      ifelse(Ps_neg_ctrl_filterd$P_fdr[i] < model_q &
               p_poho_random_filtered$`p_post-hoc`[i] < posthoc_q &
               !is.na(Ps_neg_ctrl_filterd$P_fdr[i]) &
               !is.na(p_poho_random_filtered$`p_post-hoc`[i]),
             yes = "False_positive", no = "Negative")
  }
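  # A "False_positive" label means a randomized feature cleared both the model
  # and post-hoc significance thresholds even though the data were shuffled;
  # the count of such features with "Good" CI signs (false_pos_count below)
  # gauges the propensity for spurious hits in this run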
  result_neg_ctrl <- cbind(Ps_neg_ctrl_filterd[ , c(2, 5)],
                           signal_neg_ctrl_tbl,
                           p_poho_random_filtered$`p_post-hoc`,
                           assoc_random_filtered)
  colnames(result_neg_ctrl) <- c("Signal_of_CI_signs", "Model_q", "Signal",
                                 "Posthoc_q", "Effect_size")

  # Change the rownames to avoid confusion for the users
  rownames(result_neg_ctrl) <- paste0("Randomized_feature_",
                                      seq_len(nrow(result_neg_ctrl)))

  result_neg_ctrl_sig <- result_neg_ctrl %>%
    dplyr::filter(.data$Signal == "False_positive" &
                    .data$Signal_of_CI_signs == "Good") %>%
    dplyr::select(-1)
  false_pos_count <- nrow(result_neg_ctrl_sig)

  result_neg_ctrl_sig <- result_neg_ctrl_sig %>%
    tibble::rownames_to_column("Randomized_feature")

  # Writing the table to disk is disabled; the result list is returned instead.
  # if (nrow(result_neg_ctrl_sig) > 0) {
  #   write.table(x = result_neg_ctrl_sig,
  #               file = paste0("_randomized_control.txt"), sep = "\t",
  #               row.names = TRUE, col.names = NA, quote = FALSE)
  # }

  return(list(result_neg_ctrl, false_pos_count))
}
