1 Dimensions

Documented in FR.test

#' Friedman-Rafsky (FR) test
#'
#' FR test is a multivariate generalization of nonparametric two-sample test. This function is an implementation
#' with customized options, including a visualization of the minimum spanning tree (MST).
#'
#' @param samp1 Numeric matrix or data frame for Sample 1. Rows are multivariate dimensions, and columns are samples. E.g. gene by cell.
#' @param samp2 Numeric matrix or data frame for Sample 2.
#' @param use.cosine An option if to use cosine distance. Logical variable. By default (\code{FALSE}),
#' Euclidean distance is used.
#' @param binary An option if to use binary values. Logical variable. Default: \code{FALSE}. If \code{TRUE},
#' use \code{binary.cutoff} to dichotomize \code{samp1} and \code{samp2}.
#' @param binary.cutoff Numeric value for binary cutoff. Binary value = 1 if greater than \code{binary.cutoff}, 0 otherwise. Default: \code{2}.
#' @param plot.MST Boolean variable indicating if to plot the minimum spanning tree (MST). Default: \code{FALSE}.
#' @param col Character vector of length two for customized colors of the nodes in MST. Default: \code{c("#F0E442", "#56B4E9")}.
#' @param label.names Character vector of length two for customized names of the two samples. Default: \code{c("Sample 1","Sample 2")}.
#' @param vertex.size,edge.width,... Additional plotting parammeters passed to \code{\link[igraph]{plot.igraph}}. Default: \code{vertex.size=5, edge.width=1}.
#'
#' @return Test statistics and p-values.
#' \item{runs}{Total number of subtrees.}
#' \item{runs.samp1}{Number of subtrees of Sample 1.}
#' \item{runs.samp2}{Number of subtrees of Sample 2.}
#' \item{stat}{The standardized FR statistic.}
#' \item{p.value}{P-value of the FR test.}
#' @export

FR.test <- function(samp1, samp2,
                   ## two options: 1. cosine distance, 2. use binary values
                   use.cosine=FALSE, binary=FALSE, binary.cutoff=2,
                   ## plot minimum spanning tree
                   plot.MST=FALSE, col=c("#F0E442", "#56B4E9"), label.names=c("Sample 1","Sample 2"),
                   vertex.size=5, edge.width=1,
                   ...)
{
  ## data input matrices: rows = multivariate dimensions, columns = samples
  samp1 <- t(samp1)
  samp2 <- t(samp2)
  xx <- as.matrix(samp1)
  yy <- as.matrix(samp2)
  
  ## get sample sizes
  m <- max(ncol(xx), 1)
  n <- max(ncol(yy), 1)
  N <- m+n
  
  ##--- OPTION 2: BINARY VALUES ---##
  if(binary){
    xx <- matrix(as.numeric(xx>binary.cutoff), nrow=nrow(xx))
    yy <- matrix(as.numeric(yy>binary.cutoff), nrow=nrow(yy))
  }
  ##---##
  
  ## column-wise combined data matrix
  dat <- cbind(xx, yy)
  
  ##--- OPTION 1: CONSINE DISTANCE ---##
  if(use.cosine){
    cosmat <- lsa::cosine(dat)
    cosmat[is.na(cosmat)] <- -1 #if one point is at the origin, set the -1,0,1??? cosine value
    distobj <- stats::as.dist(1-cosmat) #large cosine, small distance; and vise versa
  }
  ##---##
  
  ## euclidean distance and MST
  else distobj <- stats::dist(t(dat), method="euclidean", diag=T, upper=T)
  myMST <- ade4::neig2mat(ade4::mstree(distobj))
  
  
  ## count total runs (i.e. total subgraphs)
  bottomleft <- myMST[(m+1):N,1:m]
  runs <- sum(bottomleft)+1
  
  ## count subgraphs for each sample
  bottomright <- myMST[(m+1):N,(m+1):N]
  g.samp2 <- igraph::graph_from_adjacency_matrix(bottomright, mode="upper", weighted=NULL, diag=FALSE)
  runs.samp2 <- igraph::components(g.samp2)$no
  topleft <- myMST[1:m,1:m]
  g.samp1 <- igraph::graph_from_adjacency_matrix(topleft, mode="upper", weighted=NULL, diag=FALSE)
  runs.samp1 <- igraph::components(g.samp1)$no
  ## check
  # runs==sum(runs.samp1, runs.samp2)
  
  ## calculate common nodes
  xsum <- colSums(myMST)
  C <- sum(xsum*(xsum-1))/2
  ## calculate mean and variance
  mu <- 2*m*n/N + 1
  sigma.sq <- (2*m*n/(N*(N-1)))*((2*m*n-N)/N+(C-N+2)*(N*(N-1)-4*m*n+2)/((N-2)*(N-3)))
  ## the standardized FR-stat and p-value
  stat <- (runs-mu)/sqrt(sigma.sq)
  p.value <- stats::pnorm(stat)
  
  ## output
  return(list("statistic"=stat, "p.value"=p.value))
}