Nothing
#' Friedman-Rafsky (FR) test
#'
#' FR test is a multivariate generalization of nonparametric two-sample test. This function is an implementation
#' with customized options, including a visualization of the minimum spanning tree (MST).
#'
#' @param samp1 Numeric matrix or data frame for Sample 1. Rows are multivariate dimensions, and columns are samples. E.g. gene by cell.
#' @param samp2 Numeric matrix or data frame for Sample 2.
#' @param use.cosine An option if to use cosine distance. Logical variable. By default (\code{FALSE}),
#' Euclidean distance is used.
#' @param binary An option if to use binary values. Logical variable. Default: \code{FALSE}. If \code{TRUE},
#' use \code{binary.cutoff} to dichotomize \code{samp1} and \code{samp2}.
#' @param binary.cutoff Numeric value for binary cutoff. Binary value = 1 if greater than \code{binary.cutoff}, 0 otherwise. Default: \code{2}.
#' @param plot.MST Boolean variable indicating if to plot the minimum spanning tree (MST). Default: \code{FALSE}.
#' @param col Character vector of length two for customized colors of the nodes in MST. Default: \code{c("#F0E442", "#56B4E9")}.
#' @param label.names Character vector of length two for customized names of the two samples. Default: \code{c("Sample 1","Sample 2")}.
#' @param vertex.size,edge.width,... Additional plotting parammeters passed to \code{\link[igraph]{plot.igraph}}. Default: \code{vertex.size=5, edge.width=1}.
#'
#' @return Test statistics and p-values.
#' \item{runs}{Total number of subtrees.}
#' \item{runs.samp1}{Number of subtrees of Sample 1.}
#' \item{runs.samp2}{Number of subtrees of Sample 2.}
#' \item{stat}{The standardized FR statistic.}
#' \item{p.value}{P-value of the FR test.}
#' @export
FR.test <- function(samp1, samp2,
## two options: 1. cosine distance, 2. use binary values
use.cosine=FALSE, binary=FALSE, binary.cutoff=2,
## plot minimum spanning tree
plot.MST=FALSE, col=c("#F0E442", "#56B4E9"), label.names=c("Sample 1","Sample 2"),
vertex.size=5, edge.width=1,
...)
{
## data input matrices: rows = multivariate dimensions, columns = samples
samp1 <- t(samp1)
samp2 <- t(samp2)
xx <- as.matrix(samp1)
yy <- as.matrix(samp2)
## get sample sizes
m <- max(ncol(xx), 1)
n <- max(ncol(yy), 1)
N <- m+n
##--- OPTION 2: BINARY VALUES ---##
if(binary){
xx <- matrix(as.numeric(xx>binary.cutoff), nrow=nrow(xx))
yy <- matrix(as.numeric(yy>binary.cutoff), nrow=nrow(yy))
}
##---##
## column-wise combined data matrix
dat <- cbind(xx, yy)
##--- OPTION 1: CONSINE DISTANCE ---##
if(use.cosine){
cosmat <- lsa::cosine(dat)
cosmat[is.na(cosmat)] <- -1 #if one point is at the origin, set the -1,0,1??? cosine value
distobj <- stats::as.dist(1-cosmat) #large cosine, small distance; and vise versa
}
##---##
## euclidean distance and MST
else distobj <- stats::dist(t(dat), method="euclidean", diag=T, upper=T)
myMST <- ade4::neig2mat(ade4::mstree(distobj))
## count total runs (i.e. total subgraphs)
bottomleft <- myMST[(m+1):N,1:m]
runs <- sum(bottomleft)+1
## count subgraphs for each sample
bottomright <- myMST[(m+1):N,(m+1):N]
g.samp2 <- igraph::graph_from_adjacency_matrix(bottomright, mode="upper", weighted=NULL, diag=FALSE)
runs.samp2 <- igraph::components(g.samp2)$no
topleft <- myMST[1:m,1:m]
g.samp1 <- igraph::graph_from_adjacency_matrix(topleft, mode="upper", weighted=NULL, diag=FALSE)
runs.samp1 <- igraph::components(g.samp1)$no
## check
# runs==sum(runs.samp1, runs.samp2)
## calculate common nodes
xsum <- colSums(myMST)
C <- sum(xsum*(xsum-1))/2
## calculate mean and variance
mu <- 2*m*n/N + 1
sigma.sq <- (2*m*n/(N*(N-1)))*((2*m*n-N)/N+(C-N+2)*(N*(N-1)-4*m*n+2)/((N-2)*(N-3)))
## the standardized FR-stat and p-value
stat <- (runs-mu)/sqrt(sigma.sq)
p.value <- stats::pnorm(stat)
## output
return(list("statistic"=stat, "p.value"=p.value))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.