R/mGSZ.R

Defines functions mGSZ rm.rows.with.noSetMembers rm.small.genesets sumVarMean_calc count.prob.sum count_hyge_var_mean calc_z_var geneSetsList

Documented in calc_z_var count_hyge_var_mean count.prob.sum geneSetsList mGSZ rm.rows.with.noSetMembers rm.small.genesets sumVarMean_calc

# Liisa Holm's Bioinformatics Group, Institute of Biotechnology, University of Helsinki, Finland.
# This software is supplied without any warranty or guarantee.
# Contact: pashupati.mishra@helsinki.fi, petri.toronen@.helsinki.fi

# mGSZ: Modified Gene Set Z-score method for gene set analysis.




mGSZ <- function(expr.data, gene.sets, sample.labels, flip.gene.sets = FALSE, select= 'T-score', is.log = TRUE, gene.perm = FALSE,  min.cl.sz=5, other.methods = FALSE, pre.var = 0, wgt1 = 0.2, wgt2 = 0.5, var.constant = 10, start.val=5, perm.number = 200){

# This is a wrapper function that calculates gene scores (t-scores/fold change), gene set scores and asymptotic p-values for the gene set scores.
# In addition to mGSZ, the function calculates gene set scores with seven other methods (visit "website.fi" for details)

# Arguments:
# expr.data: Gene expression data in matrix form (rows denote genes, columns denote samples)
# gene.sets: Gene set data (list or matrix)
# sample.labels: Vector of response values (example: 1,2)
# perm.number: Number of permutations for p-value calculation
# flip.gene.sets: TRUE if gene.sets data is in list structure and genes (or gene ids) are names of list
# select: Gene score (T-score, P-value or Fold Change(FC))
# is.log: TRUE for gene score to be log Fold Change
# gene.perm: TRUE for p-value calculation with gene permutation in addition to sample permutation
# min.cl.sz: Minimum size of a gene set to be included in the analysis
# other.methods: TRUE for gene set analysis with the other seven methods (visit "website.fi" for details)
# wgt1: weight 1, parameter used to weigth the prior variance obtained with class size var_const. This penalizes especially small classes and small subsets for them, default is 0.2. Values around 0.1 - 0.5 are expected to be reasonable
# wgt2: weight 2 parameter used to weight the prior variance obtained with the same class size as that of the analyzed class. This penalizes especially small subsets from the gene list. Default is 0.5. Values around 0.3 and 0.5 have looked OK.
# var_const: size of the reference class used with wgt1. Default is 10.
# pre.var: estimate of the variance associated with each observation, BEFORE the analysis represented (uncertainty of the expression values). This can be used in the calculation of the total variance. This was not used in the article
# start.val: Number of first genes to be left out



   ########################################
   ## Testing that data sizes are correct #
   ########################################
   
   expr.dat.sz <- dim(expr.data)
   gene.sets <- toMatrix(expr.data,gene.sets,flip.gene.sets)
   num.genes <- nrow(gene.sets)
   
   if(!(num.genes == expr.dat.sz[1]) ){
      stop("Number of genes in gene set data and expression data do not match")
   }
   	
    ##Remove rows with no gene set members 
    
	data <- rm.rows.with.noSetMembers(expr.data,gene.sets) 
	expr.data <- data$expr.data
	gene.sets <- data$gene.sets
    
    ##Remove too small gene sets
    
    gene.sets <- rm.small.genesets(gene.sets,min.cl.sz)
    geneset.classes <- colnames(gene.sets)
    num.classes <- dim(gene.sets)[2]
    num.genes <- dim(expr.data)[1]
    geneset.class.sizes <- colSums(gene.sets)
   
    
    ##Calculation of log fold change summary for each gene set
    
    setsFC <- setSummary(expr.data, gene.sets, sample.labels)
    
    
    ##Calculation of gene scores 


    if(select=="FC"){
        tmp.expr.data = diff.FC.perm(expr.data, sample.labels, perm.number)
    	}
    else
        tmp = diff_c_perm(expr.data, sample.labels, perm.number)
            if(select=="T-score"){
                tmp.expr.data = tmp$t_scores}
            if(select=="P-value"){
                tmp.expr.data=tmp$p_values}
		 
    diff.expr.dat.sz <- dim(tmp.expr.data)
	
	
	## mGSZ score for positive data ####
	
    
	pos.scores <- mGSZ.test.score2(expr.data=tmp.expr.data[,1], gene.sets, wgt1, wgt2, pre.var, var.constant, start.val)
    
	pos.mGSZ.scores <- pos.scores$gene.set.scores$mGSZ.scores

  
    ## Gene set scores for positive data with other methods ##
    
	if(other.methods == TRUE){
		pos.mAllez.scores <- pos.scores$gene.set.scores$mAllez.scores
		pos.mGSA.scores <- pos.scores$gene.set.scores$mGSA.scores
        pos.ss.scores <- SS.test.score(tmp.expr.data[,1], gene.sets)
        pos.sum.scores <- sum.test.score(tmp.expr.data[,1], gene.sets)
        pos.wrs.scores <- WRS.test.score(tmp.expr.data[,1], gene.sets)
        pos.ks.scores <- KS.score(tmp.expr.data[,1], gene.sets)$KS.org
        pos.wks.scores <- KS.score(tmp.expr.data[,1], gene.sets)$KS.mod
        pos.ks.up <- KS.score(tmp.expr.data[,1], gene.sets)$profile2

    }
	 
	
	## mGSZ for sample permutation data ###
	
	
	col.perm.mGSZ <- matrix(0, diff.expr.dat.sz[2]-1, num.classes)
	col.perm.mAllez <- col.perm.mGSZ
	col.perm.mGSA <- col.perm.mGSZ
    col.perm.WRS <- col.perm.mGSZ
    col.perm.SS <- col.perm.mGSZ
    col.perm.SUM <- col.perm.mGSZ
    col.ks <- col.perm.mGSZ
    col.wks <- col.perm.mGSZ
    

    for( k in 1:(diff.expr.dat.sz[2]-1)){
    	tmp <- mGSZ.test.score(tmp.expr.data[,k+1], gene.sets,wgt1, wgt2, pre.var, var.constant, start.val)
    	col.perm.mGSZ[k,]<-tmp$mGSZ.scores

        if(other.methods==TRUE){
    		col.perm.mAllez[k,] <- tmp$mAllez.scores
    		col.perm.mGSA[k,] <- tmp$mGSA.scores
            col.ks[k,] <- KS.score(tmp.expr.data[,k+1], gene.sets)$KS.org
            col.wks[k,] <- KS.score(tmp.expr.data[,k+1], gene.sets)$KS.mod
            col.perm.WRS[k,] <- WRS.test.score(tmp.expr.data[,k+1], gene.sets)
            col.perm.SS[k,] <- SS.test.score(tmp.expr.data[,k+1], gene.sets)
            col.perm.SUM[k,] <- sum.test.score(tmp.expr.data[,k+1], gene.sets)
        }
     }
     
     


### p-value calculation for the gene set scores with sample permutation ###

    mGSZ.p.vals.col.perm <- mGSZ.p.values(pos.mGSZ.scores,col.perm.mGSZ)

    if(other.methods==TRUE){   
        wrs.p.vals.col <- WRS.p.values(pos.wrs.scores,col.perm.WRS)
        ss.p.vals.col <- SS.p.values(pos.ss.scores,col.perm.SS)
        sum.p.vals.col <- mAllez.p.values(pos.sum.scores,col.perm.SUM)
        KS.p.vals.col <- KS.p.values(pos.ks.scores,col.ks)
        wKS.p.vals.col <- KS.p.values(pos.wks.scores,col.wks)
        mGSA.p.vals.col.perm <- mGSA.p.values(pos.mGSA.scores,col.perm.mGSA)
        mAllez.p.vals.col.perm <- mAllez.p.values(pos.mAllez.scores,col.perm.mAllez)
    }



### preparing output table for each of the methods with column permutation ###


    mGSZ.table.col <- data.frame(gene.sets = colnames(gene.sets)[mGSZ.p.vals.col.perm$class.ind], set.size = geneset.class.sizes[mGSZ.p.vals.col.perm$class.ind],gene.set.scores=pos.mGSZ.scores[mGSZ.p.vals.col.perm$class.ind],pvalue=mGSZ.p.vals.col.perm$EV.class, avg.logFC = setsFC[1,mGSZ.p.vals.col.perm$class.ind], avg.abs.logFC = setsFC[2,mGSZ.p.vals.col.perm$class.ind],row.names=NULL)



    if(other.methods==TRUE){
        mGSA.table.col <- data.frame(gene.sets = colnames(gene.sets)[mGSA.p.vals.col.perm$class.ind],set.size = geneset.class.sizes[mGSA.p.vals.col.perm$class.ind],gene.set.scores=pos.mGSA.scores[mGSA.p.vals.col.perm$class.ind],pvalue=mGSA.p.vals.col.perm$EV.class, avg.logFC = setsFC[1,mGSA.p.vals.col.perm$class.ind], avg.abs.logFC = setsFC[2,mGSA.p.vals.col.perm$class.ind],row.names=NULL)
        
   
        mAllez.table.col <- data.frame(gene.sets = colnames(gene.sets)[mAllez.p.vals.col.perm$class.ind],set.size = geneset.class.sizes[mAllez.p.vals.col.perm$class.ind],gene.set.scores=pos.mAllez.scores[mAllez.p.vals.col.perm$class.ind],pvalue=mAllez.p.vals.col.perm$NORM.class, avg.logFC = setsFC[1,mAllez.p.vals.col.perm$class.ind], avg.abs.logFC = setsFC[2,mAllez.p.vals.col.perm$class.ind],row.names=NULL)
   
        WRS.table.col <- data.frame(gene.sets = colnames(gene.sets)[wrs.p.vals.col$class.ind],set.size = geneset.class.sizes[wrs.p.vals.col$class.ind],gene.set.scores=pos.wrs.scores[wrs.p.vals.col$class.ind],pvalue=wrs.p.vals.col$EMP.class, avg.logFC = setsFC[1,wrs.p.vals.col$class.ind], avg.abs.logFC = setsFC[2,wrs.p.vals.col$class.ind],row.names=NULL)
    
        SUM.table.col <- data.frame(gene.sets = colnames(gene.sets)[sum.p.vals.col$class.ind],set.size = geneset.class.sizes[sum.p.vals.col$class.ind],gene.set.scores=pos.sum.scores[sum.p.vals.col$class.ind],pvalue=sum.p.vals.col$NORM.class, avg.logFC = setsFC[1,sum.p.vals.col$class.ind], avg.abs.logFC = setsFC[2,sum.p.vals.col$class.ind],row.names=NULL)
   
        SS.table.col <- data.frame(gene.sets = colnames(gene.sets)[ss.p.vals.col$class.ind],set.size = geneset.class.sizes[ss.p.vals.col$class.ind],gene.set.scores=pos.ss.scores[ss.p.vals.col$class.ind],pvalue=ss.p.vals.col$EMP.class, avg.logFC = setsFC[1,ss.p.vals.col$class.ind], avg.abs.logFC = setsFC[2,ss.p.vals.col$class.ind],row.names=NULL)
 
        Old.KS.table.col <- data.frame(gene.sets = colnames(gene.sets)[KS.p.vals.col$class.ind],set.size = geneset.class.sizes[KS.p.vals.col$class.ind],gene.set.scores=pos.ks.scores[KS.p.vals.col$class.ind],pvalue =KS.p.vals.col$EMP.class, avg.logFC = setsFC[1,KS.p.vals.col$class.ind], avg.abs.logFC = setsFC[2,KS.p.vals.col$class.ind],row.names=NULL)
    
        New.KS.table.col <- data.frame(gene.sets = colnames(gene.sets)[wKS.p.vals.col$class.ind],set.size = geneset.class.sizes[wKS.p.vals.col$class.ind],gene.set.scores=pos.wks.scores[wKS.p.vals.col$class.ind],pvalue=wKS.p.vals.col$EMP.class, avg.logFC = setsFC[1,wKS.p.vals.col$class.ind], avg.abs.logFC = setsFC[2,wKS.p.vals.col$class.ind],row.names=NULL)
    }
    
    
### Ordering of the tables

    mGSZ.table.col <- mGSZ.table.col[order(mGSZ.table.col$pvalue,decreasing=FALSE),]
    if(other.methods==TRUE){
        mGSA.table.col <- mGSA.table.col[order(mGSA.table.col$pvalue,decreasing=FALSE),]
        mAllez.table.col <- mAllez.table.col[order(mAllez.table.col$pvalue,decreasing=FALSE),]
        WRS.table.col <- WRS.table.col[order(WRS.table.col$pvalue,decreasing=FALSE),]
        SUM.table.col <- SUM.table.col[order(SUM.table.col$pvalue,decreasing=FALSE),]
        SS.table.col <- SS.table.col[order(SS.table.col$pvalue,decreasing=FALSE),]
        Old.KS.table.col <- Old.KS.table.col[order(Old.KS.table.col$pvalue,decreasing=FALSE),]
        New.KS.table.col <- New.KS.table.col[order(New.KS.table.col$pvalue,decreasing=FALSE),]
    }


### calculation of p-value with gene (row) permutation  
    if(gene.perm==TRUE){
        row.perm.mGSZ <- matrix(0, perm.number, num.classes)
        row.perm.mAllez <- row.perm.mGSZ
        row.perm.mGSA <- row.perm.mGSZ
        row.perm.WRS <- row.perm.mGSZ
        row.perm.SS <- row.perm.mGSZ
        row.perm.SUM <- row.perm.mGSZ
        row.ks <- row.perm.mGSZ
        row.wks <- row.perm.mGSZ
        row.permuted.data <- replicate(perm.number,sample(tmp.expr.data[,1], num.genes, replace=FALSE))
        unique.perm <- unique(t(row.permuted.data))
        if(dim(unique.perm)[1]<perm.number){
        #print("Number of unique permutations:" dim(unique.perm)[1])
            row.permuted.data <- t(unique.perm)
        }
        for(i in 1:perm.number){
            res <- mGSZ.test.score4(row.permuted.data[,i],gene.sets,Z_var1=pos.scores$var.attributes$Z_var1, Z_var2=pos.scores$var.attributes$Z_var2, Z_mean1=pos.scores$var.attributes$Z_mean1, Z_mean2=pos.scores$var.attributes$Z_mean2,class_size_index1=pos.scores$var.attributes$class_size_index1, class_size_index2=pos.scores$var.attributes$class_size_index2,start.val)
            row.perm.mGSZ[i,] <- res$mGSZ.scores
            if(other.methods==TRUE){
                row.perm.mAllez[i,] <- res$mAllez.scores
                row.perm.mGSA[i,] <- res$mGSA.scores
                row.ks[k,] <- KS.score(row.permuted.data[,i], gene.sets)$KS.org
                row.wks[k,] <- KS.score(row.permuted.data[,i], gene.sets)$KS.mod
                row.perm.WRS[k,] <- WRS.test.score(row.permuted.data[,i], gene.sets)
                row.perm.SS[k,] <- SS.test.score(row.permuted.data[,i], gene.sets)
                row.perm.SUM[k,] <- sum.test.score(row.permuted.data[,i], gene.sets)
            }

		}
	

    ### p-value calculation for the gene set scores with gene (row) permutation ###

        mGSZ.p.vals.row.perm <- mGSZ.p.values(pos.mGSZ.scores,row.perm.mGSZ)
        if(other.methods==TRUE){
            wrs.p.vals.row <- WRS.p.values(pos.wrs.scores,row.perm.WRS)
            ss.p.vals.row <- SS.p.values(pos.ss.scores,row.perm.SS)
            sum.p.vals.row <- mAllez.p.values(pos.sum.scores,row.perm.SUM)
            KS.p.vals.row <- KS.p.values(pos.ks.scores,row.ks)
            wKS.p.vals.row <- KS.p.values(pos.wks.scores,row.wks)
            mGSA.p.vals.row.perm <- mGSA.p.values(pos.mGSA.scores,row.perm.mGSA)
            mAllez.p.vals.row.perm <- mAllez.p.values(pos.mAllez.scores,row.perm.mAllez)
        }



        ##### preparing output table for each of the methods with row permutation ###
        
        mGSZ.table.row <- data.frame(gene.sets = colnames(gene.sets)[mGSZ.p.vals.row.perm$class.ind],set.size = geneset.class.sizes[mGSZ.p.vals.row.perm$class.ind],gene.set.scores=pos.mGSZ.scores[mGSZ.p.vals.row.perm$class.ind],pvalue=mGSZ.p.vals.row.perm$EV.class, avg.logFC = setsFC[1,mGSZ.p.vals.row.perm$class.ind], avg.abs.logFC = setsFC[2,mGSZ.p.vals.row.perm$class.ind],row.names=NULL)
        
    
        if(other.methods==TRUE){
            mGSA.table.row <- data.frame(gene.sets = colnames(gene.sets)[mGSA.p.vals.row.perm$class.ind],set.size = geneset.class.sizes[mGSA.p.vals.row.perm$class.ind],gene.set.scores=pos.mGSA.scores[mGSA.p.vals.row.perm$class.ind],pvalue=mGSA.p.vals.row.perm$EV.class, avg.logFC = setsFC[1,mGSA.p.vals.row.perm$class.ind], avg.abs.logFC = setsFC[2,mGSA.p.vals.row.perm$class.ind],row.names=NULL)
      
            mAllez.table.row <- data.frame(gene.sets = colnames(gene.sets)[mAllez.p.vals.row.perm$class.ind],set.size = geneset.class.sizes[mAllez.p.vals.row.perm$class.ind],gene.set.scores=pos.mAllez.scores[mAllez.p.vals.row.perm$class.ind],pvalue=mAllez.p.vals.row.perm$NORM.class, avg.logFC = setsFC[1,mAllez.p.vals.row.perm$class.ind], avg.abs.logFC = setsFC[2,mAllez.p.vals.row.perm$class.ind],row.names=NULL)
        
            WRS.table.row <- data.frame(gene.sets = colnames(gene.sets)[wrs.p.vals.row$class.ind],set.size = geneset.class.sizes[wrs.p.vals.row$class.ind],gene.set.scores=pos.wrs.scores[wrs.p.vals.row$class.ind],pvalue=wrs.p.vals.row$EMP.class, avg.logFC = setsFC[1,wrs.p.vals.row$class.ind], avg.abs.logFC = setsFC[2,wrs.p.vals.row$class.ind],row.names=NULL)
        
        
            SUM.table.row <- data.frame(gene.sets = colnames(gene.sets)[sum.p.vals.row$class.ind],set.size = geneset.class.sizes[sum.p.vals.row$class.ind],gene.set.scores=pos.sum.scores[sum.p.vals.row$class.ind],pvalue=sum.p.vals.row$NORM.class, avg.logFC = setsFC[1,sum.p.vals.row$class.ind], avg.abs.logFC = setsFC[2,sum.p.vals.row$class.ind],row.names=NULL)
       
            SS.table.row <- data.frame(gene.sets = colnames(gene.sets)[ss.p.vals.row$class.ind],set.size = geneset.class.sizes[ss.p.vals.row$class.ind],gene.set.scores=pos.ss.scores[ss.p.vals.row$class.ind],pvalue=ss.p.vals.row$EMP.class, avg.logFC = setsFC[1,ss.p.vals.row$class.ind], avg.abs.logFC = setsFC[2,ss.p.vals.row$class.ind],row.names=NULL)
       
            Old.KS.table.row <- data.frame(gene.sets = colnames(gene.sets)[KS.p.vals.row$class.ind],set.size = geneset.class.sizes[KS.p.vals.row$class.ind],gene.set.scores=pos.ks.scores[KS.p.vals.row$class.ind],pvalue=KS.p.vals.row$EMP.class, avg.logFC = setsFC[1,KS.p.vals.row$class.ind], avg.abs.logFC = setsFC[2,KS.p.vals.row$class.ind],row.names=NULL)
    
            New.KS.table.row <- data.frame(gene.sets = colnames(gene.sets)[wKS.p.vals.row$class.ind],set.size = geneset.class.sizes[wKS.p.vals.row$class.ind],gene.set.scores=pos.wks.scores[wKS.p.vals.row$class.ind],pvalue=wKS.p.vals.row$EMP.class, avg.logFC = setsFC[1,wKS.p.vals.row$class.ind], avg.abs.logFC = setsFC[2,wKS.p.vals.row$class.ind],row.names=NULL)
        }
    
    
    ## Ordering of the tables for row permutation

        mGSZ.table.row <- mGSZ.table.row[order(mGSZ.table.row$pvalue,decreasing=FALSE),]
        if(other.methods==TRUE){
            mGSA.table.row <- mGSA.table.row[order(mGSA.table.row$pvalue,decreasing=FALSE),]
            mAllez.table.row <- mAllez.table.row[order(mAllez.table.row$pvalue,decreasing=FALSE),]
            WRS.table.row <- WRS.table.row[order(WRS.table.row$pvalue,decreasing=FALSE),]
            SUM.table.row <- SUM.table.row[order(SUM.table.row$pvalue,decreasing=FALSE),]
            SS.table.row <- SS.table.row[order(SS.table.row$pvalue,decreasing=FALSE),]
            Old.KS.table.row <- Old.KS.table.row[order(Old.KS.table.row$pvalue,decreasing=FALSE),]
            New.KS.table.row <- New.KS.table.row[order(New.KS.table.row$pvalue,decreasing=FALSE),]
        }
        
        if(other.methods==TRUE){
            out <- list(sample.perm =list(mGSZ = mGSZ.table.col,mGSA = mGSA.table.col,mAllez = mAllez.table.col,WRS = WRS.table.col,KS = Old.KS.table.col,wKS = New.KS.table.col,SS = SS.table.col,SUM = SUM.table.col),gene.perm=list(mGSZ = mGSZ.table.row,mGSA = mGSA.table.row,mAllez = mAllez.table.row,WRS = WRS.table.row,KS = Old.KS.table.row,wKS = New.KS.table.row, SS = SS.table.row,SUM = SUM.table.row), sample.labels = sample.labels, perm.number = perm.number, expr.data = expr.data, gene.sets = gene.sets, flip.gene.sets = flip.gene.sets, min.cl.sz = min.cl.sz, other.methods = other.methods, pre.var = pre.var, wgt2 = wgt2, wgt1 = wgt1, var.constant = var.constant, start.val = start.val, select = select, is.log = is.log, gene.perm.log = gene.perm)}
        else{
            out <- list(mGSZ.sample.perm = mGSZ.table.col, mGSZ.gene.perm = mGSZ.table.row, sample.labels = sample.labels, perm.number = perm.number, expr.data = expr.data, gene.sets = gene.sets, flip.gene.sets = flip.gene.sets, min.cl.sz = min.cl.sz, other.methods = other.methods, pre.var = pre.var, wgt2 = wgt2, wgt1 = wgt1, var.constant = var.constant, start.val = start.val, select = select, is.log = is.log, gene.perm.log = gene.perm)
            }
        return(out)


    }
    else{
        if(other.methods==TRUE){
            out <- list(mGSZ = mGSZ.table.col, mGSA = mGSA.table.col, mAllez = mAllez.table.col,WRS = WRS.table.col,KS = Old.KS.table.col,wKS = New.KS.table.col,SUM = SUM.table.col,SS = SS.table.col, sample.labels = sample.labels, perm.number = perm.number, expr.data = expr.data, gene.sets = gene.sets, flip.gene.sets = flip.gene.sets, min.cl.sz = min.cl.sz, other.methods = other.methods, pre.var = pre.var, wgt2 = wgt2, wgt1 = wgt1, var.constant = var.constant, start.val = start.val, select = select, is.log = is.log, gene.perm.log = gene.perm)
                        }
            else{
            out <- list(mGSZ = mGSZ.table.col, sample.labels = sample.labels, perm.number = perm.number, expr.data = expr.data, gene.sets = gene.sets, flip.gene.sets = flip.gene.sets, min.cl.sz = min.cl.sz, other.methods = other.methods, pre.var = pre.var, wgt2 = wgt2, wgt1 = wgt1, var.constant = var.constant, start.val = start.val, select = select, is.log = is.log, gene.perm.log = gene.perm)
            }
        return(out)
    }

}




################

rm.rows.with.noSetMembers <-
function(expr.data,gene.sets){
	tmp_sum <- apply(gene.sets, 1, sum)
    ind_wth_sets <- which(tmp_sum > 0)
    expr.data <- expr.data[ind_wth_sets,]
    gene.sets   <- gene.sets[ind_wth_sets,]
    out <- list(expr.data=expr.data, gene.sets=gene.sets)}
    
###############

rm.small.genesets <-
function(gene.sets,min.cl.sz){
    tmp_sum <- apply(gene.sets, 2, sum)
    ind_wth_sets <- which(tmp_sum > min.cl.sz)
    gene.sets   <- gene.sets[,ind_wth_sets]
	out=gene.sets}

##################

sumVarMean_calc <-
function(expr_data, gene.sets, pre.var){
	dim_sets <- dim(gene.sets)
	length_expr <- length(expr_data)
	set_sz <- colSums(gene.sets)

	unique_class_sz <- unique(set_sz)
	num_genes <- length(expr_data)
	
	divider <- c(1:num_genes)
    mean_table <- cumsum(expr_data)/divider
    mean_table_sq <- mean_table^2
    var_table <- cumsum(expr_data^2)/divider - (mean_table)^2 + pre.var
    class_sz_index <- rep(0,dim_sets[2])
    for (i in 1:dim_sets[2]){
    	class_sz_index[i] <- which(set_sz[i]==unique_class_sz)
    }
    var_table <- var_table + pre.var
    hyge_stat <- count_hyge_var_mean(dim_sets[1],unique_class_sz)
    
    z_var <- matrix(numeric(0),dim_sets[1],length(unique_class_sz)) 
    
    z_mean <- z_var
    max_value <- 1:dim_sets[1]
    for (j in 1:length(unique_class_sz)){
    	prob_sum <- count.prob.sum(dim_sets[1],hyge_stat$mean[,j],hyge_stat$var[,j])
    	z_var[,j] <- 4*(var_table*prob_sum + mean_table_sq*hyge_stat$var[,j])
    	z_mean[,j] <- mean_table*(2*hyge_stat$mean[,j]-max_value)}
        
        out = list(Z_var = z_var, Z_mean = z_mean, class_size_index = class_sz_index,var_table=var_table,mean_table_sq=mean_table_sq,set_sz=set_sz)
        return(out)
  
    
}

#########

count.prob.sum <-
function(M, hyge.mean, hyge.var){
    tulos = matrix(rep(0,length(hyge.mean)),byrow=FALSE)
    N_tab = matrix(c(2:M),byrow=FALSE)
    tulos[1] = hyge.mean[1]
    tulos[2:M] = N_tab/(N_tab-1)*hyge.mean[2:M]-(hyge.mean[2:M]^2+hyge.var[2:M])/(N_tab-1)
    tulos = tulos
}

############

count_hyge_var_mean <-
function(M,K){
    N = matrix(rep((1:M),length(K)),ncol=length(K))
    K = matrix(rep(K,M),byrow=TRUE,ncol=length(K))
    out1 = N*K/M
    out2 = N*(K*(M-K)/M^2)*((M-N)/(M-1))
    results = list(mean= out1, var= out2)
}

##############

calc_z_var <-
function(num.genes,unique_class_sz_ln,pre_z_var,wgt2,var.constant){
	ones_matrix = matrix(1,num.genes,unique_class_sz_ln)
	ones_tmatrix = t(ones_matrix)
	median_matrix = apply(pre_z_var,2,median)
	pre_median_part = ones_tmatrix*median_matrix*wgt2
	median_part = t(pre_median_part)
	z_var = (pre_z_var + median_part + var.constant)^0.5
}

#############

geneSetsList <-
function(data){
  data <- GSA.read.gmt(data)
  geneSets <- data$genesets
  names(geneSets) <- data$geneset.names
  return(geneSets)
}

Try the mGSZ package in your browser

Any scripts or data that you put into this service are public.

mGSZ documentation built on May 2, 2019, 5:53 p.m.