R/testOneConnectedComponent.R
In DEGraph: Two-sample tests on a graph

Documented in testOneConnectedComponent

## Copyright 2010 Laurent Jacob, Pierre Neuvial and Sandrine Dudoit.

## This file is part of DEGraph.

## DEGraph is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.

## DEGraph is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with DEGraph.  If not, see <http://www.gnu.org/licenses/>.

#########################################################################/**
## @RdocFunction testOneConnectedComponent
##
## @title "Applies a series of two-sample tests to a connected graph using various statistics"
##
## \description{
##  @get "title".
## }
##
## @synopsis
##
## \arguments{
##   \item{graph}{A \code{\link[=graph-class]{graph}} object.}
##   \item{data}{A '@numeric @matrix (size: number 'p' of genes x number 'n' of samples) of gene expression.}
##   \item{classes}{A @character @vector (length: 'n') of class assignments.}
##   \item{...}{Further arguments to be passed to @see "laplacianFromA".}
##   \item{prop}{A @numeric value, percentage of components retained for Fourier and PCA.}
##   \item{verbose}{If @TRUE, extra information is output.}
## }
##
## \value{
##  A structured @list containing the p-values of the tests, the 
##  \code{\link[=graph-class]{graph}} object of the connected component and the number 
##  of retained Fourier dimensions.
## }
##
## @author
##
## \details{
##   This function performs the test, assuming that all genes
##   in the graph are represented in the expression data set,
##   in order not to have to modify the graph topology.
##
##   Interaction signs are used if available in the graph
##   ('getSignedGraph' is not called here, in order not to
##   have to modify the graph topology.).
##
##   The graph given as input has to have only one
##   connex component.  It can be retrieved from the output of
##   @see "getConnectedComponentList".
## }
##
## \seealso{
##   @see "testOneGraph"
##   @see "getConnectedComponentList"
## }
##
## @examples "../incl/graph.T2.test.Rex"
##
##*/########################################################################

testOneConnectedComponent <- function(graph, data, classes, ..., prop=0.2, verbose=FALSE) {
  ## - - - - - - - - - - - - - - - - - - -
  ## Validate arguments
  ## - - - - - - - - - - - - - - - - - - -
  ## Argument 'classes'
  classes <- Arguments$getNumerics(classes)
  ll <- length(unique(classes))
  if (ll != 2) {
    throw("Number of classes should be 2, not ", ll)
  }
  rm(ll)

  ## Argument 'data'
  cls <- class(data)[1]
  if (cls != "matrix") {
    throw("Arguments 'data' should be a 'matrix', not a ", cls)
  }
  n <- ncol(data)
  nc <- length(classes)
  if (n != nc) {
    throw("Number of samples in 'data' (", n, ") should match number of samples in 'classes' (", nc, ")")
  }
  rm(nc)

  ## Argument 'graph'
  if (!validGraph(graph)) {
    throw("Argument 'graph' is not a valid graph object")
  }
  ## Check that there is only one connex component
  cc <- connectedComp(graph)
  if (length(cc)>1) {
    throw("More than one connex component in graph")
  }
  rm(cc)

  ## Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    cat <- R.utils::cat
    pushState(verbose)
    on.exit(popState(verbose))
  } 

  ## check consistency between gene and node names
  verbose && enter(verbose, "Keeping genes in the graph *and* the expression data set")
  dataGN <- rownames(data)
  graphGN <- nodes(graph)
  mm <- match(graphGN, dataGN)
  ww <- which(is.na(mm))
  ll <- length(ww)
  if (ll) {
    throw(ll, " genes of the graph were not found in the expression data set: ")
  }
  ## sorting (and subsetting if necessary)
  data <- data[mm, ]

  ## sanity check (ordering should be the same now)
  dataGN <- rownames(data)
  stopifnot(all.equal(dataGN, graphGN))
  rm(graphGN)
  rm(dataGN)
  verbose && exit(verbose)

  p <- numNodes(graph)
  geneNames <- rownames(data)

  cls <- sort(unique(classes))
  X1 <- t(data[, classes==cls[1]])
  X2 <- t(data[, classes==cls[2]])

  ## get sign information (if any) and infer from its presence the type of graph
  #signMat <- attr(graph, 'signMat')
  signMat <- graph@graphData$signMat
  if (is.null(signMat)) {
    verbose && cat(verbose, "Unsigned graph")
    ## use adjacency matrix of the corresponding undirected graph
    ##ugraph <- ugraph(graph)
    ##graphAM <- as(ugraph, "graphAM")
    ##adjMat <- graphAM@adjMat
    adjMat <- as(graph,"graphAM")@adjMat
    ## sanity check: symmetry
    ##stopifnot(sum(sum(t(adjMat)!=adjMat))==0)
    mat <- adjMat
  } else {
    verbose && cat(verbose, "Signed graph")
    mat <- signMat
  }

  ##print(mat)
  
  ## Argument 'prop'
  if (is.null(prop)) {
    prop <- c(0.05, 0.1, 0.2, 0.4)
    ndim <- unique(c(1:5, ceiling(p*prop)))
    ndim <- sort(ndim[ndim<=p])
  } else {
    prop <- Arguments$getNumerics(prop, range=c(0, 1))
    ndim <- unique(ceiling(p*prop))
  }

  dg <- rowSums(abs(mat))
  verbose && cat(verbose, "Degrees")
  verbose && str(verbose, dg)
  ## TODO: take multiplicity of eigen value into account using parameter 'k'
  lfA <- laplacianFromA(mat, ..., ltype="meanInfluence", k=1)
  U <- lfA$U

  res <- NULL
  resNames <- NULL

  ## T2 in original space
  verbose && enter(verbose, "Testing")
  verbose && cat(verbose, "Raw T2 (Hotelling)")
  T2H <- try(T2.test(X1, X2))
  if (class(T2H)=="try-error") {
    pH <- NA
  } else {
    pH <- T2H$p.value
    verbose & str(verbose, pH)
  }
  res <- c(res, pH)
  resNames <- c(resNames, "T2")

  ## T2 in Fourier space
  verbose && cat(verbose, "T2 on Fourier components")
  pU <- rep(NA, length=length(ndim))
  names(pU) <- ndim
  for (kk in seq(along=ndim)) {
    kR <- ndim[kk]
    T2U <- graph.T2.test(X1, X2, G=NULL, lfA=lfA, k=kR)
    ##T2U <- T2.test(X1%*%U[, 1:kR], X2%*%U[, 1:kR])
    pU[kk] <- T2U$p.value
  }
  verbose & str(verbose, pU)
  res <- c(res, pU)
  resNames <- c(resNames, paste("T2 (", ndim, " Fourier components)", sep=""))

  ## T2 with PCA, other test statistics
  if (FALSE) {
  verbose && cat(verbose, "T2 on PCA components")
  edX <- svd(rbind(X1, X2))
  E <- edX$v
  pPCA <- rep(NA, length=length(ndim))
  names(pPCA) <- ndim
  for (kk in seq(along=ndim)) {
    kR <- ndim[kk]
    T2PCA <- T2.test(X1%*%E[, 1:kR], X2%*%E[, 1:kR])
    pPCA[kk] <- T2PCA$p.value
  }
  res <- c(res, pPCA)
  resNames <- c(resames, paste("T2 (", ndim, " PCA components)", sep=""))

  verbose & str(verbose, pPCA)

  verbose && cat(verbose, "Adaptive Neyman (Fan)")
  T2AN <- AN.test(X1%*%U, X2%*%U, p)
  pAN <- T2AN$p.value
  verbose & str(verbose, pAN)

  res <- c(res, pAN)
  resNames <- c(resames, "Adaptive Neyman")
  } ## if (FALSE)

  verbose && exit(verbose)

  names(res) <- resNames
  ret <- list(p.value=res, graph=graph, k=ndim)

  ret
}

############################################################################
## HISTORY:
## 2010-10-08
## o Now validating argument 'verbose'.
## 2010-10-01
## o Now manipulates A instead of t(A) (transposition made within
##   laplacianFromA).
## 2010-08-05
## o CLEAN-UP: Now uses 'laplacianFromA'.
## 2010-07-15
## o ROBUSTIFICATION: Now use 'cls <- sort(unique(classes))' to make 'cls'
##   independent of the input order of the label vector.
## 2010-05
## o Created.
############################################################################