momr: Mining Metaomics Data (MetaOMineR)

Documented in extractSignificant phenoPairwiseRelations

#' \code{testRelations}
#' @title testRelations
#' @export
#' @description This function applies a statistical test either a correlation 
#'      (spearman, pearson), wilcoxon or t.test as a function of a given phenotype. 
#'      It will return a matrix of probabilities p and q values along with the 
#'      correlation coefficient or the enrichment variable when a binary parameter.
#' @author Edi Prifti & Emmanuelle Le Chatelier
#' @param data : frequency matrix with gene_ids in the rownames
#' @param trait : a vector with the trait to test, binary or numerical variable
#' @param type : a character string indicating the type of test to be applied
#' @param restrict : an optional logical vector to select a subset of the 
#'      samples to perform the test default restrict = rep(TRUE, ncol(data)) 
#'      i.e. all the samples are selected
#' @param multiple.adjust : type of multiple adjustment default is "BH" i.e. 
#'      Benjamini & Hochberg method
#' @param paired : logical with default FALSE wether the test should be paired or not
#' @param debug : default FALSE, when TRUE the progress is printed each 1000 steps
#' @return a matrix with analytical results (correlation tests) indicating rho, 
#'      rho2, p and q values for each parameter tested along with the status of the test
#' @note New add on taking into account a trait for correlation, when it is a two 
#'      class variable with the same number of elements a correlation between 
#'      both groups is performed
testRelations <- function (data,
                           trait,
                           type,
                           restrict = rep(TRUE, ncol(data)),
                           multiple.adjust = "BH",
                           paired = FALSE,
                           debug = FALSE)
{
  # Sanity checks
  if (length(trait) != ncol(data))
  {
    stop(
      "testRelations: please make sure that dimensions of the trait correspond to that of the data."
    )
  }
  
  if (length(trait) != sum(restrict))
  {
    stop(
      "testRelations: please make sure that dimensions of the trait corresponds to the restrict selection."
    )
  }
  
  if (any(class(data) != "matrix"))
  {
    print(paste("testRelations: Converting", class(data), "in matrix"))
    data <- as.matrix(data)
  }
  
  # prepare an empty result table.
  res <- as.data.frame(matrix(NA, nrow = nrow(data), ncol = 5))
  rownames(res) <- rownames(data)
  colnames(res) <- c("rho", "rho2", "p", "q", "status")
  
  trait.val <- names(table(trait))
  
  if (type != "spearman" & type != "pearson")
  {
    if (length(table(trait)) != 2)
    {
      stop(
        "testRelations: sorry, you can't use this test. The trait should contain only two categories!"
      )
    }
    
    # Categorical test
    if (type == "wilcoxon" | type == "t.test")
    {
      if (type == "wilcoxon")
      {
        if (debug)
        {
          print("Executing Wilcoxon test")
        }
        
        for (i in 1:nrow(data))
        {
          if (i %% 1000 == 0 & !debug)
          {
            print(i)
          }
          tmp <- wilcox.test(data[i, restrict] ~ trait[restrict], paired = paired)
          res[i, "p"] <- tmp$p.value
          
          if (res[i, "p"] == "NaN")
          {
            res[i, "status"] <- NA
          } else
          {
            if (mean(data[i, restrict][trait == trait.val[1]]) >
                mean(data[i, restrict][trait == trait.val[2]]))
            {
              res[i, "status"] <- trait.val[1]
            } else
            {
              res[i, "status"] <- trait.val[2]
            }
          }
        } # end for
        res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
      } # end wilcoxon
      else
      {
        if (debug)
        {
          print("Executing T test")
        }
        
        for (i in 1:nrow(data))
        {
          if (i %% 1000 == 0 & !debug)
          {
            print(i)
          }
          tmp <- t.test(data[i, restrict] ~ trait[restrict], paired = paired)
          res[i, "p"] <- tmp$p.value
          
          if (res[i, "p"] == "NaN")
          {
            res[i, "status"] <- NA
          } else
          {
            if (mean(data[i, restrict][trait == trait.val[1]]) >
                mean(data[i, restrict][trait == trait.val[2]]))
            {
              res[i, "status"] <- trait.val[1]
            } else
            {
              res[i, "status"] <- trait.val[2]
            }
          }
        } # end for
        res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
      }
    }
    else
    {
      stop(
        "testRelations: Sorry, your test does not exist! Available : spearman, pearson, t.test and wilcoxon"
      )
    }
  }
  else {
    if (type == "spearman")
    {
      if (paired)
      {
        if (debug)
        {
          print("Entering correlation mode between two classes")
        }
        
        if (length(table(trait)) != 2 |
            table(trait)[1] != table(trait)[2])
        {
          stop(
            "testRelations: Sorry, trait does not seem to be a 2-level categorical variable or with the same prevalence"
          )
        }
        cl1 <- names(table(trait)[1])
        cl1.ind <- trait == cl1
        cl2 <- names(table(trait)[2])
        cl2.ind <- trait == cl2
        if (debug)
        {
          print("Executing Spearman correlation")
        }
        
        for (i in 1:nrow(data))
        {
          if (i %% 1000 == 0 & !debug)
          {
            print(i)
          }
          
          tmp <- Hmisc::rcorr(data[i, cl1.ind], data[i, cl2.ind], type = "spearman")
          res[i, 1] <- tmp$r[1, 2]
          
          if (!is.na(res[i, 1]))
          {
            if (res[i, 1] > 0)
            {
              res[i, 5] <- "POS"
            }
            else
            {
              res[i, 5] <- "NEG"
            }
          }
          res[i, 2] <- res[i, 1] ^ 2
          res[i, 3] <- tmp$P[1, 2]
        }
        res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
      } # end if paired
      else
      {
        if (debug)
        {
          print("Executing Spearman correlation")
        }
        
        for (i in 1:nrow(data))
        {
          if (i %% 1000 == 0 & !debug)
          {
            print(i)
          }
          
          tmp <- Hmisc::rcorr(data[i, restrict], trait[restrict], type = "spearman")
          res[i, 1] <- tmp$r[1, 2]
          
          if (!is.na(res[i, 1]))
          {
            if (res[i, 1] > 0)
            {
              res[i, 5] <- "POS"
            }
            else
            {
              res[i, 5] <- "NEG"
            }
          }
          res[i, 2] <- res[i, 1] ^ 2
          res[i, 3] <- tmp$P[1, 2]
        }
        res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
      } # end else unpaired
    } # end spearman
    else {
      if (paired)
      {
        if (debug)
        {
          print("Entering correlation mode between two classes")
        }
        
        if (length(table(trait)) != 2 |
            table(trait)[1] != table(trait)[2])
        {
          stop(
            "testRelations: Sorry, trait does not seem to be a 2-level categorical variable or with the same prevalence"
          )
        }
        
        cl1 <- names(table(trait)[1])
        cl1.ind <- trait == cl1
        cl2 <- names(table(trait)[2])
        cl2.ind <- trait == cl2
        print("Executing Pearson correlation")
        for (i in 1:nrow(data))
        {
          if (i %% 1000 == 0 & !debug)
          {
            print(i)
          }
          tmp <- Hmisc::rcorr(data[i, cl1.ind], data[i, cl2.ind], type = "pearson")
          res[i, 1] <- tmp$r[1, 2]
          
          if (!is.na(res[i, 1]))
          {
            if (res[i, 1] > 0)
            {
              res[i, 5] <- "POS"
            }
            else
            {
              res[i, 5] <- "NEG"
            }
          }
          res[i, 2] <- res[i, 1] ^ 2
          res[i, 3] <- tmp$P[1, 2]
        }
        res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
      } # end if paired
      else
      {
        if (debug)
        {
          print("Executing Pearson correlation")
        }
        for (i in 1:nrow(data))
        {
          if (i %% 1000 == 0 & !debug)
          {
            print(i)
          }
          tmp <- Hmisc::rcorr(data[i, restrict], trait[restrict], type = "pearson")
          res[i, 1] <- tmp$r[1, 2]
          
          if (!is.na(res[i, 1]))
          {
            if (res[i, 1] > 0)
            {
              res[i, 5] <- "POS"
            }
            else
            {
              res[i, 5] <- "NEG"
            }
          }
          res[i, 2] <- res[i, 1] ^ 2
          res[i, 3] <- tmp$P[1, 2]
        }
        res[, 4] <- p.adjust(res[, "p"], method = multiple.adjust)
      }
    }
  }
  return(res)
}



#' \code{hierClust}
#' @title hierClust
#' @export
#' @import gplots
#' @export
#' @description This function computes the pairwise distance between samples 
#'      and computes a hierarchical clustering that is further depicted as a 
#'      heatmap graphic. The distance is computed as 1-correlation.
#' @author Edi Prifti
#' @param data : frequency matrix with gene_ids in the rownames
#' @param side : the distance can be performed on the columns or on the rows
#' @param dist : the type of distance used. By default this is correlation based similarity
#' @param cor.type : when correlation matrix, the default is Spearman
#' @param hclust.method : the hierarchical clustering method, by default it is the ward.D method
#' @param side.col.c : a vector of colors to be applied in the columns, usually depicting a class
#' @param side.col.r : a vector of colors to be applied in the rows, usually depicting a class
#' @param plot : logical default TRUE. It will plot the heatmap of the similarity with the hierarchical clustering
#' @return it will return a list of three variables, the correlation matrix, the distance matrix and the hclust object
#' @note updated hierClust functions by elechat april 7th 2015 added options SideColors added + spearman == pearson(rank)
hierClust <- function (data,
                       side = "col",
                       dist = "correlation",
                       cor.type = "spearman",
                       hclust.method = "ward.D",
                       side.col.c = NULL,
                       side.col.r = NULL,
                       plot = TRUE) {
  res <- NULL
  if (side == "col") {
    if (sum(colSums(data, na.rm = TRUE) == 0) > 0) {
      warning("Warning there are samples with no signal that need to be discarded")
    }
  }
  else {
    if (sum(rowSums(data, na.rm = TRUE) == 0) > 0) {
      warning("Warning there are genes with no signal that need to be discarded")
    }
  }
  if (dist == "correlation") {
    if (side != "col") {
      # if rows we need to transpose
      data <- t(data)
    }
    if (cor.type == "spearman") {
      data <- apply(data, 2, rank)
    } # compute the rank for spearman
    mat.rho <- Hmisc::rcorr(data, type = "pearson")$r
    diag(mat.rho) <- NA # don't use the diagonal, correlation with themselves
    # compute the distance as 1-correlation
    mat.dist <- as.dist(1 - mat.rho)
    mat.hclust <- hclust(d = mat.dist, method = hclust.method)
    diag(mat.rho) <- 1 # add the diag again
    if (plot) {
      if (is.null(side.col.c) &
          is.null(side.col.r)) {
        # if none is provided
        heatmap.2(
          mat.rho,
          scale = "none",
          trace = "none",
          Rowv = as.dendrogram(mat.hclust),
          Colv = as.dendrogram(mat.hclust),
          margins = c(6, 6),
          cex.axis = 0.7
        )
      } else {
        if (is.null(side.col.c)) {
          # if column class is not provided but the row is
          if (length(side.col.r) != ncol(mat.rho)) {
            warning("side.col.r must be a character vector of entry length")
          }
          heatmap.2(
            mat.rho,
            scale = "none",
            trace = "none",
            Rowv = as.dendrogram(mat.hclust),
            Colv = as.dendrogram(mat.hclust),
            margins = c(6, 6),
            cex.axis = 0.7,
            RowSideColors = side.col.r
          )
        } else{
          # if row class is not provided but the culumn is
          if (is.null(side.col.r)) {
            if (length(side.col.c) != ncol(mat.rho)) {
              warning("side.col.c must be a character vector of entry length")
            }
            heatmap.2(
              mat.rho,
              scale = "none",
              trace = "none",
              Rowv = as.dendrogram(mat.hclust),
              Colv = as.dendrogram(mat.hclust),
              margins = c(6, 6),
              cex.axis = 0.7,
              ColSideColors = side.col.c
            )
          } else {
            # if both classes are provided
            if (length(side.col.r) != ncol(mat.rho)) {
              warning("side.col.r must be a character vector of entry length")
            }
            if (length(side.col.c) != ncol(mat.rho)) {
              warning("side.col.c must be a character vector of entry length")
            }
            heatmap.2(
              mat.rho,
              scale = "none",
              trace = "none",
              Rowv = as.dendrogram(mat.hclust),
              Colv = as.dendrogram(mat.hclust),
              margins = c(6, 6),
              cex.axis = 0.7,
              ColSideColors = side.col.c,
              RowSideColors = side.col.r
            )
          }
        }
      }
    }
    res <- list(mat.rho = mat.rho,
                mat.dist = mat.dist,
                mat.hclust = mat.hclust)
  } else {
    stop("Other distances are not yet implemented, only 1-correlation is used here !")
  }
  return(res)
}


#' \code{filt.hierClust}
#' @title filt.hierClust
#' @export
#' @description This function takes as input a square similarity matrix and searches for clusters of samples with strong associations
#' and extracts the sub matrix with the closely related sampless. Only positive correlations are considered here.
#' @import gplots
#' @author Emmanuelle Le Chatelier & Edi Prifti
#' @param mat.rho : square correlation matrix with ids (can be used for also other than just samples)
#' @param hclust.method : the hierarchical clustering method, by default it is the ward.D method
#' @param margins : change margins of the graph (default = c(6, 6))
#' @param side.col.c : a vector of colors to be applied in the columns, usually depicting a class
#' @param side.col.r : a vector of colors to be applied in the rows, usually depicting a class
#' @param size : the number of samples in the resulting ordered matrix
#' @param plot : logical default TRUE. It will plot the heatmap of the similarity with the hierarchical clustering
#' @param filt : default is 0.5 and is the filtering threshold to be applied
#' @return it will return a matrix with samples in rows and their closely related ones on the columns along with the
#' correlation score.
filt.hierClust <- function (mat.rho,
                            hclust.method = "ward",
                            margins = c(6, 6),
                            side.col.c = NULL,
                            # margins added as option
                            side.col.r = NULL,
                            size = 10,
                            plot = TRUE,
                            filt = 0.5)
{
  rho <- mat.rho
  diag(mat.rho) <- 0
  if ((nrow(mat.rho) - 1) < size)
  {
    warning(paste("There are less than", size, "samples"))
    size <- nrow(mat.rho) - 1
  }
  
  res <- as.data.frame(matrix(NA, ncol(mat.rho), size * 2))
  rownames(res) <- colnames(mat.rho)
  colnames(res) <- paste(c("Hit", "Hit_rho"), sort(c(1:size, 1:size)), sep = "_")
  for (i in 1:nrow(mat.rho))
  {
    tmp <- sort(mat.rho[i, ], decreasing = T)
    tmp <- tmp[1:size]
    res[i, seq(1, size * 2, by = 2)] <- names(tmp)
    res[i, seq(2, size * 2, by = 2)] <- round(tmp, 3)
    sel <- colnames(res)[res[i, ] == 0]
    sel <- gsub("rho_", "", sel)
    res[i, sel] <- 0
  }
  N <- apply(mat.rho, 1, max)
  N <- N > filt
  mat.rho <- rho[N, N]
  if (all(dim(mat.rho) != 0))
  {
    if (!is.null(side.col.c))
    {
      side.col.c <- side.col.c[N]
    }
    if (!is.null(side.col.r))
    {
      side.col.r <- side.col.r[N]
    }
    diag(mat.rho) <- NA
    mat.dist <- as.dist(1 - mat.rho)
    mat.hclust <- hclust(d = mat.dist, method = hclust.method)
    diag(mat.rho) <- 1
    if (plot)
    {
      if (is.null(side.col.c) & is.null(side.col.r))
      {
        heatmap.2(
          mat.rho,
          scale = "none",
          trace = "none",
          Rowv = as.dendrogram(mat.hclust),
          Colv = as.dendrogram(mat.hclust),
          margins = margins,
          cex.axis = 0.7
        )
      }
      else {
        if (is.null(side.col.c))
        {
          if (length(side.col.r) != ncol(mat.rho))
          {
            warning("side.col.r must be a character vector of entry length")
          }
          heatmap.2(
            mat.rho,
            scale = "none",
            trace = "none",
            Rowv = as.dendrogram(mat.hclust),
            Colv = as.dendrogram(mat.hclust),
            margins = margins,
            cex.axis = 0.7,
            RowSideColors = side.col.r
          )
        }
        else
        {
          if (is.null(side.col.r))
          {
            if (length(side.col.c) != ncol(mat.rho))
            {
              warning("side.col.c must be a character vector of entry length")
            }
            heatmap.2(
              mat.rho,
              scale = "none",
              trace = "none",
              Rowv = as.dendrogram(mat.hclust),
              Colv = as.dendrogram(mat.hclust),
              margins = margins,
              cex.axis = 0.7,
              ColSideColors = side.col.c
            )
          }
          else
          {
            if (length(side.col.r) != ncol(mat.rho))
            {
              warning("side.col.r must be a character vector of entry length")
            }
            if (length(side.col.c) != ncol(mat.rho))
            {
              warning("side.col.c must be a character vector of entry length")
            }
            heatmap.2(
              mat.rho,
              scale = "none",
              trace = "none",
              Rowv = as.dendrogram(mat.hclust),
              Colv = as.dendrogram(mat.hclust),
              margins = margins,
              cex.axis = 0.7,
              ColSideColors = side.col.c,
              RowSideColors = side.col.r
            )
          }
        }
      }
    }
  }
  else
  {
    warning(paste("There are no related samples above the threshold", filt))
  }
  return(res)
}


#' \code{lmp}
#' @title lmp
#' @description This function will extract the p-value from a linear model object. It is used by phenoPairwiseRelations
#' @author Edi Prifti
#' @param modelobject : a linear model object as produced by lm()
#' @return a p-value
lmp <- function (modelobject) 
{
  if (class(modelobject) != "lm")
    stop("Not an object of class 'lm' ")
  f <- summary(modelobject)$fstatistic
  p <- pf(f[1], f[2], f[3], lower.tail = F)
  attributes(p) <- NULL
  return(p)
}

#' \code{phenoPairwiseRelations}
#' @title phenoPairwiseRelations
#' @description This function will compute all the relations between different variables adapting different statistical tests as a
#' function of the data type. It will adjust the p-value matrix for multiple testing.
#' @author Edi Prifti
#' @param data : bioclinical data with variables on the rows and samples on the columns
#' @param adjust : method to adjust for multiple testing (default="BH)
#' @param verbose : default=FALSE. If TRUE information will be printed to follow the progression.
#' @return a list of two matrixes containing the p-values and the multiple testing adjustment.
phenoPairwiseRelations <- function(data,
                                   adjust = "BH",
                                   verbose = FALSE) 
{
  #require(nortest)
  #require(Hmisc)
  data.relations <- matrix(NA, ncol(data), ncol(data))
  colnames(data.relations) <- colnames(data)
  rownames(data.relations) <- colnames(data)
  for (i in 1:ncol(data)) {
    if (verbose)
      print(paste(i, colnames(data)[i]))
    for (j in 1:ncol(data)) {
      if (verbose)
        print(paste(j, colnames(data)[j]))
      x <- data[, i]
      y <- data[, j]
      # test weather the variables are factors
      # a) both factors
      if (is.factor(x)) {
        if (is.factor(y)) {
          # perfom a chisquare
          if (nrow(table(x, y)) == 1 |
              ncol(table(x, y)) == 1 | sum(table(x, y)) == 0) {
            # do nothing
          } else{
            data.relations[i, j] <- chisq.test(table(x, y))$p.value
          }
        } else{
          # b) x is a factor but not y
          # test for normality
          if (nortest::lillie.test(y)$p.value > 0.05) {
            #if normal
            if (length(table(x)) > 1) {
              if (length(table(x)) == 2) {
                if (all(colSums(table(y, x)) > 1)) {
                  data.relations[i, j] <- t.test(y ~ x)$p.value
                }
              } else{
                if (sum(table(y, x)) > 1 &
                    length(unique(x[!is.na(x)])) != 1 & sum(colSums(table(y, x)) > 0) > 1) {
                  tmp <- lmp(lm(y ~ x))
                  if (!is.null(tmp)) {
                    data.relations[i, j] <- tmp
                  }
                }
              }
            }
          } else{
            # not normal
            # run anova
            if (length(table(x)) > 1) {
              if (length(table(x)) == 2) {
                # should test here the distribution of y if normal than t.test otherwise wilcox
                if (all(colSums(table(y, x)) > 1)) {
                  data.relations[i, j] <- wilcox.test(y ~ x)$p.value
                }
              } else{
                if (sum(table(y, x)) > 1 &
                    length(unique(x[!is.na(x)])) != 1 & sum(colSums(table(y, x)) > 0) > 1) {
                  tmp <- kruskal.test(y ~ x)
                  if (!is.null(tmp)) {
                    data.relations[i, j] <- tmp$p.value
                  }
                }
              }
            }
          }
        }
      } else{
        # x is not a factor
        if (is.factor(y)) {
          # but y is a factor
          # test for normality
          if (nortest::lillie.test(x)$p.value > 0.05) {
            #if normal
            if (length(table(y)) > 1) {
              if (length(table(y)) == 2) {
                if (all(colSums(table(x, y)) > 1)) {
                  data.relations[i, j] <- t.test(x ~ y)$p.value
                }
              } else{
                if (sum(table(x, y)) > 1 &
                    length(unique(y[!is.na(y)])) != 1 & sum(colSums(table(x, y)) > 0) > 1) {
                  tmp <- lmp(lm(x ~ y))
                  if (!is.null(tmp)) {
                    data.relations[i, j] <- tmp
                  }
                }
              }
            }
          } else{
            # non normal
            if (length(table(y)) > 1) {
              if (length(table(y)) == 2) {
                if (all(colSums(table(x, y)) > 1)) {
                  data.relations[i, j] <- wilcox.test(x ~ y)$p.value
                }
              } else{
                if (sum(table(x, y)) > 1 &
                    length(unique(y[!is.na(y)])) != 1 & sum(colSums(table(x, y)) > 0) > 1) {
                  tmp <- kruskal.test(x ~ y)
                  if (!is.null(tmp)) {
                    data.relations[i, j] <- tmp$p.value
                  }
                }
              }
            }
          }
        } else{
          # if both are numeric, run a correlation by testing for normality
          if (length(x[!is.na(x)]) > 4 & length(x[!is.na(y)]) > 4) {
            if (nortest::lillie.test(x)$p.value < 0.05 |
                nortest::lillie.test(y)$p.value < 0.05) {
              # use spearman
              data.relations[i, j] <- Hmisc::rcorr(x, y, type = "spearman")$P[1, 2]
            } else{
              # use pearson
              data.relations[i, j] <- Hmisc::rcorr(x, y, type = "pearson")$P[1, 2]
            }
          }
        }
      }
    }
  }
  # adjust for multiple testing
  data.relations.q <- matrix(p.adjust(data.relations , method = adjust),
                             nrow = nrow(data.relations))
  
  colnames(data.relations.q) <- colnames(data.relations)
  rownames(data.relations.q) <- rownames(data.relations)
  return(list(p = data.relations, q = data.relations.q))
}


#' \code{extractSignificant}
#' @export
#' @title extractSignificant
#' @description This function will extract a list of vectors p- or q-values from an object produced by phenoPairwiseRelations.
#' @author Edi Prifti
#' @param relation.matrix : a matrix of p- produced by or q-values by phenoPairwiseRelations()
#' @param interest : a vector of variable names of interest.
#' @param threshold : default 0.05 needed to select significant relations
#' @return a list of vectors containing p-values or q-values along with the names of the variables.
extractSignificant <- function(relation.matrix, interest, threshold = 0.05) 
{
  res <- list()
  for (i in 1:length(interest)) {
    tmp.val <- relation.matrix[, interest[i]]
    tmp <- (tmp.val < threshold)
    tmp[is.na(tmp)] <- FALSE
    res[[i]] <- tmp.val[tmp]
  }
  names(res) <- interest
  return(res)
}


#' End of section and file