Nothing
# SAMGS : original method from Dinu et al - BMC Bioinformatics - 2007
#
# EDIT 17 Sep 2014, Ludwig Geistlinger:
# adapting for use in EnrichmentBrowser package
#
# EDIT 02 Aug 2015, Ludwig Geistlinger
# adapting for use in SAFE framework (local and global stat)
# SAMGS stat as global.stat for safe
global.SAMGS <- function(cmat, u, ...)
{
isAvailable("SparseM", type="software")
am <- getMethod("as.matrix", signature = "matrix.csr")
cmat <- t(am(cmat))
function(u, cmat2=cmat) as.vector(cmat2 %*% u^2)
}
# SAM t-like stat as local.stat for safe
local.t.SAM <- function (X.mat, y.vec, ...)
{
rMV <- function(data)
{
m <- rowMeans(data)
dif <- data - m
ssd <- rowSums(dif^2)
l <- list("means"=m,
"sumsquaredif" =ssd,
"vars" =ssd/(ncol(data)-1),
"centered rows"=dif)
return(l)
}
nb.Groups <- 100
mad.Const <- .64
stopifnot(length(unique(y.vec)) == 2)
if (!all(sort(unique(y.vec)) == c(0,1))) {
warning("y.vec is not (0,1), thus Group 1 == ", y.vec[1])
y.vec <- (y.vec == y.vec[1]) * 1
}
return(function(data, vec = y.vec, ...)
{
C1 <- which(vec==1) # cases
C2 <- which(vec==0) # controls
nb.Genes <- nrow(data)
nb.Samples <- ncol(data)
C1.size <- length(C1)
C2.size <- length(C2)
stat.C1 <- rMV(data[,C1])
stat.C2 <- rMV(data[,C2])
diffmean.C1C2 <- stat.C1$means - stat.C2$means
pooledSqrtVar.C1C2 <- sqrt( (1/C1.size+1/C2.size) *
(stat.C1$sumsquaredif + stat.C2$sumsquaredif) / (nb.Samples-2) )
tmp <- as.data.frame(cbind(pooledSqrtVar.C1C2,diffmean.C1C2))
tmp <- tmp[order(tmp[,1]),]
group.Size <- as.integer(nb.Genes/nb.Groups)
percentiles <- seq(0,1,.05)
nb.Percentiles <- length(percentiles)
s0.quantiles <- quantile(pooledSqrtVar.C1C2,percentiles)
tt <- matrix(NA,nb.Groups,nb.Percentiles)
coeffvar <- as.data.frame(cbind(s0.quantiles,rep(NA,nb.Percentiles)))
group.grid <- seq_len(group.Size*nb.Groups)
denom <- tmp[group.grid,1]
for(j in seq_len(nb.Percentiles))
{
nom <- tmp[group.grid,2] + s0.quantiles[j]
x <- matrix(denom/nom, group.Size, nb.Groups)
tt[,j] <- apply(x, 2, mad, constant=mad.Const)
coeffvar[j,2] <- sd(tt[,j]) / mean(tt[,j])
}
s0 <- min(s0.quantiles[coeffvar[,2] == min(coeffvar[,2])])
TlikeStat <- diffmean.C1C2 / (pooledSqrtVar.C1C2+s0)
return(TlikeStat)
})
}
## START: original code
rowMeansVars <- function(DATA,margin=1)
{
if(margin==2) DATA <- t(DATA)
m <- rowMeans(DATA)
dif <- DATA - m
ssd <- rowSums(dif^2)
list("means"=m,
"sumsquaredif" =ssd,
"vars" =ssd/(ncol(DATA)-1),
"centered rows"=dif )
}
sam.TlikeStat <- function(DATA,
cl=NULL,
s0=NULL,
s0.param=list(nb.Groups=100,mad.Const=.64) )
{
# DATA : expression data
# -> dataframe with rows=genes,
# columns=samples,
# cl : factor defining a bipartition of the samples
# IN THE SAME ORDER AS IN DATA
# NB : use 'C1' + 'C2' XOR 'cl' alone
if(!is.null(cl))
{
C1 <- which(cl==1) # cases
C2 <- which(cl==0) # controls
}
if(is.null(C1) | is.null(C2))
stop("Error - sam.TlikeStat : classes 1 and 2 are undefined.")
nb.Genes <- nrow(DATA)
nb.Samples <- ncol(DATA)
C1.size <- length(C1)
C2.size <- length(C2)
stat.C1 <- rowMeansVars(DATA[,C1])
stat.C2 <- rowMeansVars(DATA[,C2])
diffmean.C1C2 <- stat.C1$means - stat.C2$means
pooledSqrtVar.C1C2 <- sqrt( (1/C1.size+1/C2.size) *
(stat.C1$sumsquaredif + stat.C2$sumsquaredif) / (nb.Samples-2) )
if(is.null(s0)){
nb.Groups <- s0.param$nb.Groups
mad.Const <- s0.param$mad.Const
tmp <- as.data.frame(cbind(pooledSqrtVar.C1C2,diffmean.C1C2))
tmp <- tmp[order(tmp[,1]),]
group.Size <- as.integer(nb.Genes/nb.Groups)
percentiles <- seq(0,1,.05)
nb.Percentiles <- length(percentiles)
s0.quantiles <- quantile(pooledSqrtVar.C1C2,percentiles)
tt <- matrix(NA,nb.Groups,nb.Percentiles)
coeffvar <- as.data.frame(cbind(s0.quantiles,rep(NA,nb.Percentiles)))
for(j in seq_len(nb.Percentiles)){
x <- matrix( tmp[seq_len(group.Size*nb.Groups),1] /
(tmp[seq_len(group.Size*nb.Groups),2] +
s0.quantiles[j]), group.Size, nb.Groups)
tt[,j] <- apply(x, 2, mad, constant=mad.Const)
coeffvar[j,2] <- sd(tt[,j]) / mean(tt[,j])
}
s0 <- min(s0.quantiles[coeffvar[,2] == min(coeffvar[,2])])
}
list(s0 = s0,
diffmean = diffmean.C1C2,
pooledSqrtVar = pooledSqrtVar.C1C2,
TlikeStat = diffmean.C1C2/(pooledSqrtVar.C1C2+s0))
}
# GS.format.dataframe.to.list was called below in the function SAMGS
GS.format.dataframe.to.list <- function(GS)
{
if(is.data.frame(GS)){
genes <- rownames(GS)
L <- NULL
for(ags in names(GS)){
w <- which(GS[,ags]==1)
if(length(w)>0) {
L <- c(L,list(genes[w]))
names(L)[length(L)] <- ags
}
}
L
}else{
GS
}
}
SAMGS <- function(GS,
DATA,
cl,
nbPermutations=1000,
silent=FALSE,
tstat.file=NULL)
{
# GS : gene sets
# -> a dataframe with rows=genes,
# columns= gene sets,
# GS[i,j]=1 if gene i in gene set j
# GS[i,j]=0 otherwise
# OR
# a list with each element corresponding
# to a gene set = a vector of strings (genes identifiers)
#
# DATA : expression data
# -> a dataframe with rows=genes,
# columns=samples
#
# cl : a factor defining a bipartition of the
# samples IN THE SAME ORDER AS IN DATA
#
genes <- rownames(DATA)
GS <- GS.format.dataframe.to.list(GS)
GS <- lapply(GS,function(z) which(genes %in% z))
nb.Samples <- ncol(DATA)
nb.GeneSets <- length(GS) # nb of gene sets
GeneSets.sizes <- sapply(GS,length) # size of each gene set
C1.size <- table(cl)[2] # nb of samples in class 1
# finding constant s0 for SAM-like test
tmp <- sam.TlikeStat(DATA,cl=cl)
s0 <- tmp$s0
# stats obtained on 'true' data
# SAM T-like statistic for each gene
samT.ok <- tmp$TlikeStat
if(!is.null(tstat.file)) save(samT.ok, file=tstat.file)
# SAMGS statitic for each gene set
sam.sumsquareT.ok <- sapply(GS,function(z) sum(samT.ok[z]^2))
# stats obtained on 'permuted' data
permut.C1 <- matrix(NA,nbPermutations,C1.size)
sam.sumsquareT.permut <- matrix(NA,nbPermutations,nb.GeneSets)
cl.permut=cl #initial group values
for(i in seq_len(nbPermutations))
{
C1.permut <- permut.C1[i,] <- sample(nb.Samples,C1.size)
C2.permut <- seq_len(nb.Samples)[-C1.permut]
cl.permut[C1.permut] <- 1
cl.permut[C2.permut] <- 0
samT.permut <- sam.TlikeStat(DATA,cl=cl.permut,s0=s0)$TlikeStat
sam.sumsquareT.permut[i,] <-
sapply(GS,function(z) sum(samT.permut[z]^2))
# SAMGS statitic for each gene set - for current permutation
if(!silent && (i%%50 == 0)) message(paste(i," permutations done."))
}
GeneSets.pval <- apply(
t(sam.sumsquareT.permut) >= sam.sumsquareT.ok, 1, sum) / nbPermutations
#GeneSets.qval <- qvalue::qvalue(GeneSets.pval)$qvalues
#GeneSets.pval <- GeneSets.qval
norm.stat <- sam.sumsquareT.ok / lengths(GS)
res.tbl <- cbind(sam.sumsquareT.ok, norm.stat, GeneSets.pval)
colnames(res.tbl) <- c("SUMSQ.STAT", "NSUMSQ.STAT", configEBrowser("PVAL.COL"))
rownames(res.tbl) <- names(GS)
return(res.tbl)
}
# adapt SAMGS to
## restandardization (Efron and Tibshirani, 2007), and
## non-zero permutation p-values (Phipson and Smith, 2010)
SAMGS2 <- function(GS,
DATA,
cl,
nbPermutations=1000,
silent=FALSE,
tstat.file=NULL,
restand=TRUE,
exact.p=TRUE)
{
# GS : gene sets
# -> a dataframe with rows=genes,
# columns= gene sets,
# GS[i,j]=1 if gene i in gene set j
# GS[i,j]=0 otherwise
# OR
# a list with each element corresponding
# to a gene set = a vector of strings (genes identifiers)
#
# DATA : expression data
# -> a dataframe with rows=genes,
# columns=samples
#
# cl : a factor defining a bipartition of the
# samples IN THE SAME ORDER AS IN DATA
#
genes <- rownames(DATA)
GS <- GS.format.dataframe.to.list(GS)
GS <- lapply(GS,function(z) which(genes %in% z))
nb.Samples <- ncol(DATA)
nb.GeneSets <- length(GS) # nb of gene sets
GeneSets.sizes <- sapply(GS,length) # size of each gene set
C1.size <- table(cl)[2] # nb of samples in class 1
# finding constant s0 for SAM-like test
tmp <- sam.TlikeStat(DATA,cl=cl)
s0 <- tmp$s0
# stats obtained on 'true' data
# SAM T-like statistic for each gene
samT.ok <- tmp$TlikeStat
if(!is.null(tstat.file)) save(samT.ok, file=tstat.file)
######
## modification 1: scale DE t-stat
samT.ok <- scale(samT.ok)
# SAMGS statitic for each gene set
######
## modification 2: mean GS-stat instead sum
sam.sumsquareT.ok <- sapply(GS,function(z) mean(samT.ok[z]^2))
######
## modification 3: mean
mu.samT <- mean(samT.ok)
sd.samT <- sd(samT.ok)
# stats obtained on 'permuted' data
permut.C1 <- matrix(NA,nbPermutations,C1.size)
sam.sumsquareT.permut <- matrix(NA,nbPermutations,nb.GeneSets)
cl.permut=cl #initial group values
for(i in seq_len(nbPermutations))
{
C1.permut <- permut.C1[i,] <- sample(nb.Samples,C1.size)
C2.permut <- seq_len(nb.Samples)[-C1.permut]
cl.permut[C1.permut] <- 1
cl.permut[C2.permut] <- 0
samT.permut <- sam.TlikeStat(DATA,cl=cl.permut,s0=s0)$TlikeStat
sam.sumsquareT.permut[i,] <-
sapply(GS,function(z) sum(samT.permut[z]^2))
# SAMGS statitic for each gene set - for current permutation
if(!silent && (i%%50 == 0)) message(paste(i," permutations done."))
}
### Modification to original code to avoid perm p-values of zero
### (see Phipson and Smith, 2010)
sumf <- ifelse(exact.p, function(x) sum(x) + 1, sum)
if(exact.p) nbPermutations <- nbPermutations + 1
###
GeneSets.pval <- apply(
t(sam.sumsquareT.permut) >= sam.sumsquareT.ok, 1, sumf) / nbPermutations
#GeneSets.qval <- qvalue::qvalue(GeneSets.pval)$qvalues
#GeneSets.pval <- GeneSets.qval
norm.stat <- sam.sumsquareT.ok / lengths(GS)
res.tbl <- cbind(sam.sumsquareT.ok, norm.stat, GeneSets.pval)
colnames(res.tbl) <- c("SUMSQ.STAT", "NSUMSQ.STAT", configEBrowser("PVAL.COL"))
rownames(res.tbl) <- names(GS)
return(res.tbl)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.