# R/num.sv.R In sva: Surrogate Variable Analysis

#### Documented in num.sv

#' A function for calculating the number of surrogate variables to estimate in a model
#'
#' This function estimates the number of surrogate variables that should be included
#' in a differential expression model. The default approach is based on a permutation
#' procedure originally prooposed by Buja and Eyuboglu 1992. The function also provides
#' an interface to the asymptotic approach proposed by Leek 2011 Biometrics.
#'
#' @param dat The transformed data matrix with the variables in rows and samples in columns
#' @param mod The model matrix being used to fit the data
#' @param method One of "be" or "leek" as described in the details section
#' @param vfilter You may choose to filter to the vfilter most variable rows before performing the analysis
#' @param B The number of permutaitons to use if method = "be"
#' @param seed Set a seed when using the permutation approach
#'
#' @return n.sv The number of surrogate variables to use in the sva software
#'
#' @examples
#'
#' pheno = pData(dat)
#' edata = exprs(dat)
#' mod = model.matrix(~as.factor(cancer), data=pheno)
#'
#' n.sv = num.sv(edata,mod,method="leek")
#'
#' @export
#'
num.sv <- function(dat, mod,method=c("be","leek"),vfilter=NULL,B=20,seed=NULL) {
if(!is.null(vfilter)){
if(vfilter < 100 | vfilter > dim(dat)[1]){
stop(paste("The number of genes used in the analysis must be between 100 and",dim(dat)[1],"\n"))
}
tmpv = rowVars(dat)
ind = which(rank(-tmpv) < vfilter)
dat = dat[ind,]
}

method <- match.arg(method)
if(method=="be"){
if(!is.null(seed)){set.seed(seed)}
warn <- NULL
n <- ncol(dat)
m <- nrow(dat)
H <- mod %*% solve(t(mod) %*% mod) %*% t(mod)
res <- dat - t(H %*% t(dat))
uu <- svd(res)
ndf <- min(m,n) - ceiling(sum(diag(H)))
dstat <- uu\$d[1:ndf]^2/sum(uu\$d[1:ndf]^2)
dstat0 <- matrix(0,nrow=B,ncol=ndf)

for(i in 1:B){
res0 <- t(apply(res, 1, sample, replace=FALSE))
res0 <- res0 - t(H %*% t(res0))
uu0 <- svd(res0)
dstat0[i,] <- uu0\$d[1:ndf]^2/sum(uu0\$d[1:ndf]^2)
}
psv <- rep(1,n)
for(i in 1:ndf){
psv[i] <- mean(dstat0[,i] >= dstat[i])
}
for(i in 2:ndf){
psv[i] <- max(psv[(i-1)],psv[i])
}

nsv <- sum(psv <= 0.10)
return(as.numeric(list(n.sv = nsv)))
}else{
dat <- as.matrix(dat)
dims <- dim(dat)
a <- seq(0,2,length=100)
n <- floor(dims[1]/10)
rhat <- matrix(0,nrow=100,ncol=10)
P <- (diag(dims[2])-mod %*% solve(t(mod) %*% mod) %*% t(mod))
for(j in 1:10){
dats <- dat[1:(j*n),]
ee <- eigen(t(dats) %*% dats)
sigbar <- ee\$values[dims[2]]/(j*n)
R <- dats %*% P
wm <- (1/(j*n))*t(R) %*% R - P*sigbar
ee <- eigen(wm)
v <- c(rep(TRUE, 100), rep(FALSE, dims[2]))
v <- v[order(c(a*(j*n)^(-1/3)*dims[2],ee\$values), decreasing = TRUE)]
u <- 1:length(v)
w <- 1:100
rhat[,j] <- rev((u[v==TRUE]-w))
}
ss <- rowVars(rhat)

bumpstart <- which.max(ss > (2*ss[1]))
start <- which.max(c(rep(1e5,bumpstart),ss[(bumpstart+1):100]) < 0.5*ss[1])
finish <- which.max(ss*c(rep(0,start),rep(1,100-start)) > ss[1])
if(finish==1){finish <- 100}

n.sv <- modefunc(rhat[start:finish,10])
return(n.sv)
print(method)
}
}

## Try the sva package in your browser

Any scripts or data that you put into this service are public.

sva documentation built on Nov. 8, 2020, 8:16 p.m.