R/compute_scores_and_pval.R

Defines functions compute_scores assign_pval adjust_pval

Documented in adjust_pval assign_pval compute_scores

# mage R package

# This file contains functions to:
#   1. Compute the table of scores (both asso. and charac. scores)
#   2. Determine the p-values of MIC Scores
#   3. Adjust p-values using FDR methodology

#' @title compute_scores
#'
#' @description
#' Computes the table of association and characterization scores
#'
#' @param x matrix: a gene expression matrix
#' (rownames corresponding to genes and colnames to cells)
#'
#' @param n.cores integer: nb of cores to be used for parallel processing
#'
#' @return
#' Returns a table with the following columns:
#' \describe{
#'   \item{1:2}{Names of the two genes whose assocation is being assessed}
#'   \item{3}{MIC score, \emph{i.e} the strength of the association between the 2 genes}
#'   \item{4:n}{Characterization scores, informing about the "shape" of the association}
#' }
#'
#' @importFrom utils combn
#' @importFrom stats cor p.adjust
#' @importFrom data.table data.table
compute_scores <- function(x,
                           n.cores = 1) {

  n_genes <- nrow(x);

  # Get unique pairs
  # ----------------------------------------------------------------------
  pairs <- t(combn(nrow(x), 2));
  n_pairs <- nrow(pairs);
  pairs_name <- paste(rownames(x)[pairs[, 1]],
                      rownames(x)[pairs[, 2]],
                      sep = "-");
  pairs_ix <- 1:n_pairs;

  # Compute MINE scores
  # ----------------------------------------------------------------------
  cat(" * Compute the MINE scores for all pairs of variables ...\n");
  cat("     this may take quite some time ...\n");
  MINE_sym_mats <- minerva::mine(t(x), master = 1:n_genes,
                                 n.cores = n.cores,
                                 alpha = 0.6, C = 15);
  n_scores <- length(MINE_sym_mats);
  scores_name <- names(MINE_sym_mats);

  # Aggregate the symetric matrices into a global table
  # where each row is a unique association
  MINE_mat <- matrix(nrow = n_pairs, ncol = n_scores);
  colnames(MINE_mat) <- scores_name;
  for(i in 1:n_scores) {
    MINE_mat[, i] <- unlist(vapply(pairs_ix,
                            function(ix)
                              as.list(MINE_sym_mats[[i]][pairs[ix, 1], pairs[ix, 2]]),
                            FUN.VALUE = c(numeric)));
  }

  # Compute Pearson and Spearman Correlations as well
  # ----------------------------------------------------------------------
  cat(" * Compute the Pearson Correlation for all pairs of variables ...\n");
  PEARSON_cor <- unlist(vapply(pairs_ix,
                               function(ix) as.list(cor(x[pairs[ix, 1], ],
                                                        x[pairs[ix, 2], ],
                                                        method = "pearson")),
                               FUN.VALUE = c(numeric)));
  cat(" * Compute the Spearman Correlation for all pairs of variables ...\n");
  SPEARMAN_cor <- unlist(vapply(pairs_ix,
                                function(ix) as.list(cor(x[pairs[ix, 1], ],
                                                         x[pairs[ix, 2], ],
                                                         method = "spearman")),
                                FUN.VALUE = c(numeric)));

  # Sort MINE_mat by decreasing order of MIC Score
  # ----------------------------------------------------------------------
  MINE_table <- data.frame(MINE_mat);
  sort_ix <- sort(unlist(MINE_table[,1]),
                  decreasing = T, index.return = T)$ix;

  # Write final table, assign customed column names
  # ----------------------------------------------------------------------
  final_table <- data.table(`X gene` = rownames(x)[pairs[sort_ix, 1]],
                            `Y gene` = rownames(x)[pairs[sort_ix, 2]],
                            `MIC (strength)` = MINE_table[sort_ix, ]$`MIC`,
                            `MIC-p^2 (nonlinearity)` = MINE_table[sort_ix, ]$`MICR2`,
                            `MAS (non-monotonicity)` = MINE_table[sort_ix, ]$`MAS`,
                            `MEV (functionality)` = MINE_table[sort_ix, ]$`MEV`,
                            `MCN (complexity)` = MINE_table[sort_ix, ]$`MCN`,
                            `PEARSON p (linear correlation)` = PEARSON_cor[sort_ix],
                            `SPEARMAN rho (rank correlation)` = SPEARMAN_cor[sort_ix]);

  return(final_table);
}


#' @title assign_pval
#'
#' @description
#' Assigns p-values to MIC scores
#'
#' @param MIC_scores numeric vector of MIC scores
#' @param nb_cells nb of cells in the gene expression matrix
#'
#' @return
#' Returns a vector of p-values
#'
#' @details
#' Since the p-value of a given MIC score cannot be computed
#' by a simple mathemical function, \code{assigns_pval} reads
#' the p-value table generated by the authors of the MINE
#' statistics (see \url{http://www.exploredata.net/Downloads/P-Value-Tables})
#' at the closest \emph{sample_size} to \emph{nb_cells}.
#'
#' For each \emph{MIC score}, \code{assign_pval} will then
#' retrieve the corresponding p-value from this table.
#'
assign_pval <- function(MIC_scores,
                        nb_cells) {
  MIC_scores <- round(MIC_scores, 3);

  # Get indexes of each unique value in the MIC score vector
  # ----------------------------------------------------------------------
  uniq_MIC_scores <- unique(MIC_scores);
  uniq_MIC_scores_ix <- list();
  for(i in 1:length(uniq_MIC_scores)) {
    uniq_MIC_scores_ix[[i]] <- which(MIC_scores == uniq_MIC_scores[i]);
  }

  # Get appropriate pval_tab:
  # sample_size must be as close as possible to nb_cells
  # ----------------------------------------------------------------------
  pval_data <- load(system.file("R/sysdata.rda", package = "mage"));
  closest_sample_size_ix <- which.min(abs(as.integer(sample_size) - nb_cells));
  pval_tab <- pval_tables[[closest_sample_size_ix]];

  # Print the absolute difference btw nb_cells and sample_size
  # ----------------------------------------------------------------------
  cat(" * Find pval table with the n closest to nb_cells\n");
  cat(sprintf("   abs(sample_size - nb_cells) = %d\n",
                abs(as.integer(sample_size[closest_sample_size_ix]) - nb_cells)));
  cat("   If this difference is too big, consider ignoring these p-values ...\n");

  # Get the boundaries of the table
  # ----------------------------------------------------------------------
  pval_tab_max_MIC <- pval_tab[1, 1];
  pval_tab_min_MIC <- pval_tab[nrow(pval_tab), 1];
  most_signif_MIC_ix <- which(uniq_MIC_scores > c(pval_tab_max_MIC));
  least_signif_MIC_ix <- which(uniq_MIC_scores < c(pval_tab_min_MIC));

  # Get the p-value assigned to any MIC written in pval_tab
  # ----------------------------------------------------------------------
  pval_tab_MIC_scores <- round(pval_tab[,1], 3);
  pval_tab_uniq_MIC_scores <- unique(pval_tab_MIC_scores);
  pval_tab_uniq_MIC_scores_pval <- rep(0, length(pval_tab_uniq_MIC_scores));
  for(i in 1:length(pval_tab_uniq_MIC_scores)) {
    pval_tab_uniq_MIC_scores_pval[i] <-
      round(pval_tab[which(pval_tab_MIC_scores == pval_tab_uniq_MIC_scores[i])[1], 2], 3);
  }
  pval_tab_uniq_MIC_scores_pval <- unlist(pval_tab_uniq_MIC_scores_pval);

  # Use these references to assign a p-value to each MIC score
  # present in the MIC_scores vector
  # ----------------------------------------------------------------------
  p_values <- rep(1, length(MIC_scores)); # Any unassigned MIC will have a p-value of 1
  first_i = length(most_signif_MIC_ix) + 1;
  last_i = length(uniq_MIC_scores) - length(least_signif_MIC_ix) + 1;
  for(i in first_i:last_i) {
    cur_ix <- which.min(
      abs(pval_tab_uniq_MIC_scores
          - uniq_MIC_scores[i]));
    p_values[uniq_MIC_scores_ix[[i]]] <- pval_tab_uniq_MIC_scores_pval[cur_ix];
  }

  # Assign p-value = 0 to any MIC > max(MIC) present in pval_tab
  # ----------------------------------------------------------------------
  if(length(most_signif_MIC_ix > 0)) {
    for(i in 1:length(most_signif_MIC_ix)) {
      p_values[uniq_MIC_scores_ix[[most_signif_MIC_ix[i]]]] <- 0;
    }
  }

  return(p_values);
}


#' @title adjust_pval
#'
#' @description
#' Adjust p-values using FDR method (Benjamini & Hochberg, 1995)
#'
#' @param pval numeric vector of p-values
#'
#' @return
#' Returns a numeric vector of adjusted p-values
#'
adjust_pval <- function(pval) {
  adjusted_pval <- p.adjust(pval, method = "BH");
  return(adjusted_pval);
}
charles-bernard/mage documentation built on May 14, 2019, 2 a.m.