R/MAVTgsa.R

Defines functions Hott2 Wilksn MAVTn MAVTp

Documented in Hott2 MAVTn MAVTp Wilksn

######### Hotelling's T^2 statistic using Shrinkage covariance matrix estimates ###############
######### x:data matrix; Row: sample; Column: variables(genes)                  ###############
######### y: vector defining two-group of the samples                           ###############


library(corpcor)

Hott2 <- function(x, y, var.equal=TRUE){
    if(!is.null(y)){
         cl <- as.factor(y)
         lab <- levels(cl)
         ind1 <- which(cl==lab[1])
         ind2 <- which(cl==lab[2])
     }
     if(is.null(ind1) | is.null(ind2))stop("Error: classes 1 and 2 are undefined.")

    data1 <- x[ind1,]
    data2 <- x[ind2,]
    nn <- dim(data1)[1]
    nd <- dim(data2)[1]
    SN <- cov.shrink(data1,verbose=FALSE, lambda.var=0)
    SD <- cov.shrink(data2,verbose=FALSE, lambda.var=0)
    xbar1 <- colMeans(data1)
    xbar2 <- colMeans(data2)
    xdiff <- xbar1 - xbar2

    if (var.equal){
       S <- ((nd - 1) * SD + (nn - 1) * SN)/(nd + nn - 2)
       t2 <- ((nd * nn)/(nd + nn)) * (xdiff %*% solve(S) %*% xdiff)
    }
    else{
       S <- SN/nn + SD/nd
       t2 <- xdiff %*% solve(S) %*% xdiff
    }

    return(as.numeric(t2))
}

######### Ordinary least square(OLS) statistic                                  ###############
######### x:data matrix; Row: sample; Column: variables(genes)                  ###############
######### y: vector defining two-group of the samples  
Tols <- function (x, y ) {
if(!is.null(y)){
         cl <- as.factor(y)
         lab <- levels(cl)
         ind1 <- which(cl==lab[1])
         ind2 <- which(cl==lab[2])
     }
     if(is.null(ind1) | is.null(ind2))stop("Error: classes 1 and 2 are undefined.")
    data1 <- x[ind1,]
    data2 <- x[ind2,]
    nn <- dim(data1)[1]
    nd <- dim(data2)[1]
    #-------------O'Brien's OLS----------------

  olsf <- function(x){
    t.test(x[ind1],x[ind2],var.equal=TRUE)$statistic
  }
 
  t.emat <- apply(x,2,olsf)
  cor.emat <- cor(x)
  tols <- sum(t.emat)/sqrt(sum(cor.emat))
   

    return(abs(as.numeric(tols)))
}




######### Wilks Lambda for n-group comparisons                           ###############
######### Y:data matrix; Row: sample; Column: variables(genes)               ###############
######### Class: vector defining the clinical outcome of the samples         ###############
######### type: type of contrast;                                            ###############
######### base: which group is considered the baseline group for Dunnett contrasts ###############
library(MASS)
library(multcomp)
"design.matrix" <-
function(factors)
{
### Keep the initial order
  n<-length(factors)
  fac<-factor(factors)
  lev<-levels(fac)
  l<-length(lev)
  if(l<2)
    stop("Should have at least two groups")
  X<-matrix(0,n,l)
  for(i in 1:l)
    X[factors==lev[i],i]<-1
  X
}



library(MASS)
ma.estimate <- function (Y, X) {
    ginv(t(X) %*% X) %*% t(X) %*% Y
}



Wilksn <- function(Y, class,type = c("Tukey", "Dunnett", "Sequence"), base=1){
   X <- design.matrix(class)
   hatB <- ma.estimate(Y, X)
   type <- match.arg(type)
   lab <- unique(class)

   E=0
   for(i in 1:length(lab)){
   ind0 <- which(class==lab[i])
   data0<- Y[ind0,]
   n0   <- length(ind0)
   S0   <- cov.shrink(data0,verbose=FALSE)
   E    <- E+(n0-1)*S0
   }

   k<- length(lab)
    names(Y) <- paste("T", 1:k, sep="")
    CM1 <- c()
    CM <- c()
    rnames <- c()
    if (!is.null(names(Y)))
        varnames <- names(Y)
   else varnames <- 1:length(Y)
    kindx <- 1:k
    for (i in 1:(k - 1)) {
            for (j in (i + 1):k) {
                CM1 <- rbind(CM1, as.numeric(kindx == j) - as.numeric(kindx == i))
                }}
   L <- CM1
   H <- t(hatB) %*% t(L) %*% ginv(L %*% ginv(t(X) %*% X) %*% t(L)) %*% L %*% hatB
   temp <- solve(E) %*%H
   stat <- 1/det(temp+diag(1,dim(temp)[1],dim(temp)[1]))            
    switch(type, Dunnett = {
        for (i in kindx[-base]) CM <- rbind(CM, as.numeric(kindx == i) - as.numeric(kindx == base))
        rnames <- paste(varnames[kindx[-base]], "-", varnames[base])
        },
                  Tukey = {
        for (i in 1:(k - 1)) {
            for (j in (i + 1):k) {
                CM <- rbind(CM, as.numeric(kindx == j) - as.numeric(kindx == i))
                rnames <- c(rnames, paste(varnames[j], "-", varnames[i]))
            }}}, 
                  Sequence = {
        for (i in 2:k) {
            CM <- rbind(CM, as.numeric(kindx == i) - as.numeric(kindx == i - 1))
            rnames <- c(rnames, paste(varnames[i], "-", varnames[i - 1]))
        }})


   L <- CM
   for (i in 1:nrow(L)){
   D  <- matrix(0,nrow(L),ncol(L))
   D[i,]=1
   L0  <- D*L
   H0 <- t(hatB) %*% t(L0) %*% ginv(L0 %*% ginv(t(X) %*% X) %*% t(L0)) %*% L0 %*% hatB
   temp0 <- solve(E) %*%H0
   stat0 <- 1/det(temp0+diag(1,dim(temp0)[1],dim(temp0)[1]))
   stat=c(stat, stat0)

   }
   names(stat)=c("ANOVA", rnames)
   return(stat)
}


library(MASS)
library(multcomp)
library(corpcor)

MAVTn <- function(DATA, GS, MCP=1 , alpha=0.01, nbPerm=5000){


 # DATA : expression data with rows=genes, columns=samples
 #        Note that the first row is the group of sample
 #
 # GS : gene sets
 #      -> a data matrix with rows=genes,
 #                        columns= gene sets,
 #                        GS[i,j]=1 if gene i in gene set j
 #                        GS[i,j]=0 otherwise
 #
 #
 # alpha: signinificant level
 #
 #
 # MCP: type of multiple comparison methods;
 #      Dunnett = 1, Tuckey = 2, Sequential Group = 3 
 # 
     cl <- as.numeric(DATA[1,])
     DATA <- DATA[-1,]
     genes <- rownames(DATA)
     k <- length(levels(factor(cl)))
     n.Samples  <- ncol(DATA)
     n.GeneSets <- ncol(GS)
     GeneSets.sizes <- apply(GS,2, function(z) sum(z==1))
     
     if (k>=3) {
     base <- 1
     if (MCP==1){type <- "Dunnett"}
     if (MCP==2){type <- "Tukey"}
     if (MCP==3){type <- "Sequence"}
     if ((1*(MCP==1)+1*(MCP==2)+1*(MCP==3))==0) stop("Error: MCP must be 1, 2 or 3")
     } 

     # observed statitic for each gene set
     if (k==2) {
      stat.ols.obs <- apply(GS, 2, function(z) Tols(t(DATA[which(z==1),]), cl))
      stat.hott.obs <- apply(GS, 2, function(z) Hott2(t(DATA[which(z==1),]), cl))
      }
     if (k>=3) stat.obs <- apply(GS, 2, function(z) Wilksn(t(DATA[which(z==1),]), cl, type , base))

     # stats obtained on 'permuted' data

     stat.ols.permut <- matrix(NA,nbPerm,n.GeneSets)
     stat.hott.permut <- matrix(NA,nbPerm,n.GeneSets)

     if (k>=3){
     stat.temp.permut <- matrix(NA,nbPerm*nrow(stat.obs),n.GeneSets)
     }
      for(i in 1:nbPerm) {
         ind <- sample(n.Samples)
         p.data <- DATA[,ind]
         if (k==2){
           stat.ols.permut[i,] <- apply(GS, 2, function(z) Tols(t(p.data[which(z==1),]), cl))
           stat.hott.permut[i,] <- apply(GS, 2, function(z) Hott2(t(p.data[which(z==1),]), cl))
           }

         if (k>=3){
         k.p <- apply(GS, 2, function(z) Wilksn(t(p.data[which(z==1),]), cl, type , base))
           stat.temp.permut[c(seq(1,nrow(stat.obs)))+nrow(stat.obs)*(i-1),] <- k.p
           }
         }



     if (k==2) {
        GeneSets.ols.pval <- apply(t(stat.ols.permut) >= stat.ols.obs, 1, sum)/nbPerm
        GeneSets.hott.pval <- apply(t(stat.hott.permut) >= stat.hott.obs, 1, sum)/nbPerm
     s.pvalue <- apply(DATA,1, function(z) unlist(summary(aov(z~as.factor(cl))))["Pr(>F)1"])
     nb.ols.sign <- which(GeneSets.ols.pval<=alpha)
     sg.ols.pvalue <- apply(as.matrix(GS[,nb.ols.sign]),2,function(z) c(list("GS size"=sum(z==1),"p-value"=round(s.pvalue[z==1],4))))
     nb.hott.sign <- which(GeneSets.hott.pval<=alpha)
     sg.hott.pvalue <- apply(as.matrix(GS[,nb.hott.sign]),2,function(z) c(list("GS size"=sum(z==1),"p-value"=round(s.pvalue[z==1],4))))
     fwe.ols.pvalue.permut <- matrix(NA,n.GeneSets,nbPerm)
     fwe.hott.pvalue.permut <- matrix(NA,n.GeneSets,nbPerm)
     
     for (i in 1:nbPerm){
         ind.p <- sample(nbPerm) 
         fwe.ols.pvalue.permut[,i] <- apply(t(stat.ols.permut)[,which(ind.p<=ceiling(nbPerm/10))] <= stat.ols.obs, 1, sum)/(nbPerm/10)
         fwe.hott.pvalue.permut[,i] <- apply(t(stat.hott.permut)[,which(ind.p<=ceiling(nbPerm/10))] <= stat.hott.obs, 1, sum)/(nbPerm/10)
        }
     q.ols.ind <- rank(GeneSets.ols.pval,ties.method ="first")
     q.hott.ind <- rank(GeneSets.hott.pval,ties.method ="first") 
     q.ols.val <- minp(fwe.ols.pvalue.permut,q.ols.ind,n.GeneSets,nbPerm)
     q.hott.val <- minp(fwe.hott.pvalue.permut,q.hott.ind,n.GeneSets,nbPerm)

     
     
#------------significant marker--------------------------------

     ols.star <- c()
     hott.star <- c()
     for (i in 1:n.GeneSets){
     if (GeneSets.ols.pval[i]<=alpha) ols.star[i] <- "*"
     else ols.star[i] <- " "

     if (GeneSets.hott.pval[i]<=alpha) hott.star[i] <-"*"
     else hott.star[i] <- " "
     }
     ols.sign <- data.frame(ols.star)
     colnames(ols.sign) <- " "
     hott.sign <- data.frame(hott.star)
     colnames(hott.sign) <- " "    
#-------------adjust p-value-----------------------------------
     ad.ols.pvalue <- round(p.adjust(GeneSets.ols.pval,method="BH"),4)
     fwe.ols.pvalue <- apply(q.ols.val<=GeneSets.ols.pval,1,sum)/nbPerm
     ad.hott.pvalue <- round(p.adjust(GeneSets.hott.pval,method="BH"),4)
     fwe.hott.pvalue <- apply(q.hott.val<=GeneSets.hott.pval,1,sum)/nbPerm
        res <- as.data.frame(cbind("GS size"              = GeneSets.sizes,
                                   "OLS p-value"          = GeneSets.ols.pval,
                                       ols.sign,
                                   "OLS adjusted p-value (FDR)"   = ad.ols.pvalue,
                                   "OLS adjusted p-value (FWE)"   = fwe.ols.pvalue,
                                   "T-square p-value"     = GeneSets.hott.pval,
                                        hott.sign,
                                   "T-square adjusted p-value (FDR)"= ad.hott.pvalue,
                                   "T-square adjusted p-value (FWE)"= fwe.hott.pvalue
                                        ))
     result <- list("p value"=res,"Singinificant gene set (one-sided)"=sg.ols.pvalue,"Singinificant gene set (two-sided)"=sg.hott.pvalue)
 #----------------gsaplot---------------------------------------   
     p.sort.ols <- sort(GeneSets.ols.pval)
     p.sort.hott <- sort(GeneSets.hott.pval)
     plot(p.sort.ols,type="l",xaxs = "i",  
        xlab = "Ranked p-value", ylab = "p-value", main = "")
     par(new=TRUE)
     plot(p.sort.hott,type="l",axes = FALSE, col="red", xaxs = "i",  
        xlab = "", ylab = "", main = "")    
     #lines( par()$usr[1:2], par()$usr[3:4] , lty=2)
     abline(a=0,b=1/n.GeneSets,lty=2)
     leg.names <- c("OLS",expression("Hotelling's "*T^2))
     legend(locator(1),leg.names,lty=c(1,1),col=c(1,2))
     }
     
     
     
     if (k>=3){
         stat.temp.pval <- matrix(NA,nrow(stat.obs),n.GeneSets)
         for (i in 1:nrow(stat.obs)){
         t.pval=apply(t(stat.temp.permut[seq(i,nrow(stat.temp.permut),nrow(stat.obs)),])<= stat.obs[i,],1,sum)/nbPerm
           stat.temp.pval[i,] <- t.pval
           }
           fwe.ad.pval.permut <- matrix(NA,n.GeneSets,nbPerm)
           stat.permut.obs <- t(stat.temp.permut[seq(1,nrow(stat.temp.permut),nrow(stat.obs)),])
     for (i in 1:nbPerm){
         ind.p <- sample(nbPerm) 
         fwe.ad.pval.permut[,i] <- apply(stat.permut.obs[,which(ind.p<=ceiling(nbPerm/10))] <= stat.obs[1,], 1, sum)/(nbPerm/10)
        }
     q.mon.ind <- rank(t(stat.temp.pval)[,1],ties.method ="first") 
     q.ad.val <- minp(fwe.ad.pval.permut,q.mon.ind,n.GeneSets,nbPerm)
           
     r1<-(1:(k*(k-1)/2))
     r2<-NULL
     r3<-NULL
     for(i in 1:k){
     r2<-c(r2,rep(i,k-i))
     if (i+1<=k)
     r3<-c(r3,((i+1):k))
     }
     names=paste(rownames(k.p)," p-value",sep="")
     dimnames(stat.temp.pval) <- list(names)
     s.pvalue <- apply(DATA,1, function(z) unlist(summary(aov(z~as.factor(cl))))["Pr(>F)1"])
     nb.sign <- which(stat.temp.pval[1,]<=alpha)
     sg.pvalue <- apply(as.matrix(GS[,nb.sign]),2,function(z) c(list("GS size"=sum(z==1),"p-value"=round(s.pvalue[z==1],4))))
     
     star <- c()
     for (i in 1:n.GeneSets){
     if (stat.temp.pval[1,i]<=alpha) star[i] <- "*"
     else star[i] <- " "
     }
     sign <- data.frame(star)
     ad.pval <- round(p.adjust(t(stat.temp.pval)[,1],method="BH"),4)
     fwe.ad.pval <- 1- apply(q.ad.val<=t(stat.temp.pval)[,1],1,sum)/nbPerm
     colnames(sign) <- " "
     res <- as.data.frame(cbind("GS size"              = GeneSets.sizes,
                                "MANOVA p-value"   =t(stat.temp.pval)[,1],
                                sign,
                                "adjusted p-value (FDR)"   = ad.pval,
                                "adjusted p-value (FWE)"   = fwe.ad.pval,
                                t(stat.temp.pval)[,-1]     ))
   
     rownames(res)=colnames(GS)
     result <- list("p value"=res, "singinificant gene set"=sg.pvalue)
     p.sort.manova <- sort(t(stat.temp.pval)[,1])

     plot(p.sort.manova,type="l",xaxs = "i", 
        xlab = "Ranked p-value", ylab = "MANOVA p-value", main = "")
     #lines( par()$usr[1:2], par()$usr[3:4] , lty=2)
     abline(0,1/n.GeneSets,lty=2)
     }


     
     
     
   
    return(result)
}

#------------gstplot-------------------------
GSTplot <- function (data,gs,geneset.name=NULL,alpha=0.01){
     ccl <- as.numeric(data[1,])
     k <- length(levels(factor(ccl)))
     group <- c(1,2)
if (k==2){

if (is.null(geneset.name)||sum(1*(colnames(gs)==geneset.name))==0) {
     stop("Gene set name is unmatched", 
     call. = FALSE)
    }
    GS=gs[,which(colnames(gs)==geneset.name)]
    
    if (!is.null(data)){
         cl <- as.factor(group)
         lab <- levels(cl)
         ind1 <- which(data[1,]==lab[1])
         ind2 <- which(data[1,]==lab[2])
     }
    g=as.matrix(2:dim(data)[[1]])
    t.stat.two <- apply(g, 1, function(z) t.test(data[z,ind2],data[z,ind1],)$statistic )
    names(t.stat.two)=row.names(data[-1,])
    t.stat.sorted <- sort(t.stat.two)
    t.stat.order <- order(t.stat.two)
    gene.names.sorted <- names(t.stat.sorted)
    
aa=rt(length(t.stat.sorted),dim(data)[[2]]-2)   
m <- dim(data[-1,])[[1]]
ings <- which(GS==1)
stopa1 <- numeric(length(ings))
for (i in 1:length(ings)){
stopa1[i] <- which(t.stat.order==ings[i])
}
stopa <- sort(stopa1)
stopb <- stopa
    for (i in 2:(length(stopa)-1)){
    if (stopa[i]<=m/2) {
        if  (stopa[i]-max(stopa[i-1],stopb[i-1])<130)
        stopb[i] <- max(stopa[i],stopb[i-1])+130
        else stopb[i] <- stopa[i]
        }
    if (stopa[i]>m/2)   {
        if  (min(stopa[i+1],(stopb[i+1]-260)-stopa[i])<130) 
        stopb[i] <- min(stopa[i],stopb[i+1]-260)-130
        else stopb[i] <- stopa[i]
        }
    }
stopb[length(stopa)] <- stopa[length(stopa)]-130    
    nf <- layout(matrix(c(1, 2), 2, 1, byrow = TRUE), c(4, 4), 
        c(1, 3), TRUE)
    par(adj = 0.5, plt = c(0.2, 0.9, 0, 0.8))
    plot(1:m, rep(0,m), ylim = c(0.04,1), xaxs = "i", axes = FALSE, 
        xlab ="", ylab ="", main = "", col = 0)
        
    for (i in 1:length(stopa))  {
text(stopb[i]-200,0.3,gene.names.sorted[stopa[i]],srt = 90, cex = 0.5, font = 1, adj=-1, pos=4)     
lines(c(stopa[i],stopa[i],stopb[i]),c(0,0.15,0.25),type="l",col=1)
}

    par(adj = 0.5, plt = c(0.2, 0.9, 0.65, 1))
    plot(c(0, m), c(-6, 6), xlim = c(1, dim(data[-1])[[1]]), xaxs = "i", yaxs = "i", 
        xlab = "Ranked t-statistics", ylab = "Observed t-statistics", main = "", col = 0)
        text(m/25, 3.87, geneset.name, cex = 0.8, font = 2, adj = 0)
        polygon(c(0, m*alpha, m*alpha, 0), 
            c(-6, -6, 10, 10), border = FALSE, col = 8)
        polygon(c( m*(1-alpha),m, m,m*(1-alpha)), 
            c(-6, -6, 10, 10), border = FALSE, col = 8)

        lines(sort(t.stat.sorted),type="l",col=2)
        for (i in 1:length(stopa)){
        lines(c(stopa[i],stopa[i]),c(0,sort(t.stat.two[which(GS==1)])[i]),type="l",col=2)
        }
        
        abline(h=0,lty=2)
        #lines( par()$usr[1:2], par()$usr[3:4] , lty=2) 
        
par(new=TRUE)
Fnt <- ecdf(sort(t.stat.two[which(GS==1)]))
ecdf.y <- Fnt(knots(Fnt))
plot(stopa,ecdf.y, type="s",xlim = c(1, dim(data[-1])[[1]]),xaxs = "i",axes=FALSE,xlab="",ylab="")

}


#---------------plot for MANOVA---------------------------------------

if (k>=3)
{
if (is.null(geneset.name)||sum(1*(colnames(gs)==geneset.name))==0) {
     stop("Gene set name is unmatched", 
     call. = FALSE)
    }
    GS=gs[,which(colnames(gs)==geneset.name)]
    cl <- as.numeric(data[1,])
    df1 <- length(levels(factor(cl)))-1
    df2 <- length(cl)-length(levels(factor(cl)))    
    g=as.matrix(2:dim(data)[[1]])
    F.sata <- apply(g, 1, function(z) unlist(summary(aov(data[z,]~as.factor(cl))))["F value1"] )
    names(F.sata)=row.names(data[-1,])
    t.stat.sorted <- sort(F.sata)
    t.stat.order <- order(F.sata)
    gene.names.sorted <- names(t.stat.sorted)
    lables.sig <- qf(1-alpha,df1,df2)
    lowb <- (dim(data)[1]-1)*(1-alpha)
aa=rt(length(t.stat.sorted),dim(data)[[2]]-2)   
m <- dim(data[-1,])[[1]]
maxf <- max(F.sata)
ings <- which(GS==1)
stopa1 <- numeric(length(ings))
for (i in 1:length(ings)){
stopa1[i] <- which(t.stat.order==ings[i])
}
stopa <- sort(stopa1)
stopb.big <- sort(which(stopa>=lowb),decreasing=TRUE)
stopb.i <- max(stopa)-0.01*m
if (length(stopb.big)>1) {
    for (i in 2:length(stopb.big)){
        if  (stopb.i[i-1]-stopa[stopb.big[i]]>m*0.013) 
        stopb.i <- c(stopb.i,stopa[stopb.big[i]])
        else stopb.i <- c(stopb.i,stopb.i[i-1]-m*0.013)
    }
    stopb <- sort(stopb.i)}
    
     nf <- layout(matrix(c(1, 2), 2, 1, byrow = TRUE), c(4, 4), 
        c(1, 3), TRUE)
    par(adj = 0.5, plt = c(0.2, 0.9, 0, 0.8))
    plot(1:m, rep(0,m), ylim = c(0.04,1), xaxs = "i", axes = FALSE, 
        xlab ="", ylab ="", main = "", col = 0)
        
    for (i in 1:length(stopa))  {
    
lines(c(stopa[i],stopa[i]),c(0,0.15),type="l",col=1)
}
for (i in 1:length(stopb.i))    {

lines(c(sort(stopa,decreasing=TRUE)[i],stopb.i[i]),c(0.15,0.25),type="l",col=1)
}


name.sig <- which(stopa>m*(1-alpha))
for (i in 1:length(stopb.i)){
text(stopb.i[i]-m*0.026,0.27,gene.names.sorted[sort(stopa,decreasing=TRUE)[i]],srt = 90, cex = 0.5, font = 2, adj=-1, pos=4)    
}

    par(adj = 0.5, plt = c(0.2, 0.9, 0.65, 1))
    plot(c(0, m), c(0, maxf), xlim = c(1, m), xaxs = "i", yaxs = "i", 
        xlab = "Ranked F-statistics", ylab = "Observed F-statistics", main = "", col = 0)
        text(m/25, maxf-5, geneset.name, cex = 0.8, font = 2, adj = 0)
        polygon(c( lowb,m, m,lowb), 
            c(0, 0, maxf, maxf), border = FALSE, col = 8)

        lines(sort(t.stat.sorted),type="l",col=2)
        for (i in 1:length(stopa)){
        lines(c(stopa[i],stopa[i]),c(0,sort(F.sata[which(GS==1)])[i]),type="l",col=2)
        }
        
        abline(h=0,lty=2)
        #lines( par()$usr[1:2], par()$usr[3:4] , lty=2) 
        
par(new=TRUE)
Fnt <- ecdf(sort(F.sata[which(GS==1)]))
ecdf.y <- Fnt(knots(Fnt))
plot(c(0,stopa),c(0,ecdf.y), type="s",xlim = c(1, m),xaxs = "i",axes=FALSE,xlab="",ylab="")

}
}

#--------------minp----------------
minp <- function (p,rank,n.GeneSets,nbPerm){
     q.val <- matrix(NA,n.GeneSets,nbPerm)
     q.val.ra <- matrix(NA,n.GeneSets,nbPerm)
     q.val.ra[n.GeneSets,] <- p[which.max(rank),]
     for (j in (n.GeneSets-1):1){
         q.val.ra[j,] <- pmin(q.val.ra[(j+1),],p[which(rank==j),])
        }
     for (i in n.GeneSets:1) {
         q.val[which(rank==i),] <- q.val.ra[i,] 
        }
     return(q.val)
    }
    
#library(MCMCpack)
library(randomForest)
library(foreach)
library(corpcor)
#library(doParallel)


############### Random Forest ################
MAVTp <- function(DATA, GS, nbPerm=5000, numoftree=500,type=c("cont","cate"),impt=TRUE){

     cl <- as.numeric(DATA[1,])
     DATA_1 <- DATA[-1,]
     p.value=rep(0,dim(GS)[2])
     temp <- "importance measure"
     for (i in 1:dim(GS)[2]){
     ind_gs <- which(GS[,i]==1)
     DATA <- DATA_1[ind_gs,]

if (type=="cont"){
################  continuous group  ################


group <- as.numeric(cl)
nb.Samples  <- ncol(DATA)
test.rf <- randomForest(t(DATA),group, ntree=numoftree, importance=TRUE, proximity=FALSE, mtry=5)
value=round(test.rf$mse[numoftree],4)



nullV<-foreach(j=1:nbPerm, .combine=cbind) %do%{
ind=sample(1:length(group));
testp.rf <- randomForest(t(DATA)[ind,], group, ntree=numoftree, importance=FALSE, proximity=FALSE,mtry=5);
round(testp.rf$mse[numoftree],4)
}

Pvalue.rf=sum(nullV<=value)/nbPerm
#RF_pvalue=Pvalue.rf
p.value[i]<- Pvalue.rf
  temp <- list(temp, Imp=importance(test.rf))
}
################  categorical data  ################
if (type=="cate"){

group <- as.factor(cl)
  test.rf <- randomForest(t(DATA),group, ntree=numoftree, importance=TRUE, proximity=FALSE)
  value=round(test.rf$err.rate[numoftree,1],4)

  nullV<-foreach(j=1:nbPerm, .combine=cbind) %do%{
    ind=sample(1:length(group))
    testp.rf <- randomForest(t(DATA)[ind,], group, ntree=numoftree, importance=FALSE, proximity=FALSE);
    round(testp.rf$err.rate[numoftree,1],4)
  }

  Pvalue=sum(nullV<=value)/length(nullV)
  #RF_pvalue=Pvalue
  p.value[i]<- Pvalue
  temp <- list(temp, Imp=importance(test.rf)[,3:4])
}



}


if (impt==TRUE) result <- list("p.value"=p.value, "importance"=temp)
else result <- list(p.value)
return(result)
}

Try the MAVTgsa package in your browser

Any scripts or data that you put into this service are public.

MAVTgsa documentation built on May 2, 2019, 6:37 a.m.