R/ExClust_compute_stats.R

ExClust_compute_stats <- function(gene.IDs=NULL, gene.PVals=NULL){

    ### first we need to grab the gene names & p-values for each gene
    # grab the 1st row of each gene
    pvalue.Indices <- match(unique(gene.IDs), gene.IDs)
    # now grab the gene IDs from these rows
    NamesGenePVals <- gene.IDs[pvalue.Indices]
    # lastly grab the p-values from these rows
    GenePVals <- gene.PVals[pvalue.Indices]

    ### ExCluster progression message
    message("Running statistical tests ...","",sep="\n")

    ### data frame for testing p-values
    Catch_PVals <- data.frame(NamesGenePVals,GenePVals)
    colnames(Catch_PVals) <- c("EnsG","pvals")
    # remove NAs
    Catch_PVals <- Catch_PVals[which(complete.cases(Catch_PVals[,2])),]
    # sort PVals from lowest to highest
    Catch_PVals <- Catch_PVals[order(Catch_PVals[,2]),]

    if (nrow(Catch_PVals) >= 5000){
        ### normalize p-values
        # estimate the number of 'truly spliced genes'
        pvalCutoff <- EstNullHypo(Catch_PVals[,2])+0.005
        # estimate the number of truly spliced genes (may include some false positives)
        NumTrue <- length(which(Catch_PVals[,2] <= pvalCutoff))
        # estimate the number of null hypothesis genes
        NumNull <- length(which(Catch_PVals[,2] > pvalCutoff))

        ### if we have > 0 NumTrue p-values
        if (NumTrue > 0){
            # Now give the null hypothesis p-values uniform distriubtions
            Catch_PVals[seq((NumTrue+1),(NumTrue+NumNull)),2] <- sort(runif(n=NumNull,0,1))
            # Now divide all true p-values by a factor based on the lowest null hypothesis value
            Catch_PVals[seq(NumTrue),2] <- Catch_PVals[seq(NumTrue),2]/
                                            (Catch_PVals[NumTrue,2]/Catch_PVals[(NumTrue+1),2])
            # Now compute FDR values
            FDRs <- FDRcalc(Catch_PVals[,2],NumTrue,NumNull)
        }else{
            ### if we have no truly significant p-values from our estimate
            Catch_PVals[,2] <- sort(runif(n=NumNull,0,1))
            FDRs <- rep(1,nrow(Catch_PVals))
        }

    }else{

        ### This part of the code will be run if less than 5000 expressed genes were identified
        warning(call = ExCluster_errors$too_few_pvalues)

        # because we have too few pvalues (< 5000), we cannot run our advanced stats
        # we therefore run a basic correction with Benjamini-Hochberg
        pvalCutoff <- 0.025
        # estimate the number of truly spliced genes
        NumTrue <- nrow(Catch_PVals[Catch_PVals[,2] <= pvalCutoff,])
        # estimate the number of null hypothesis genes
        NumNull <- nrow(Catch_PVals[Catch_PVals[,2] > pvalCutoff,])
        # set the minimum value a null hypothesis p-value can be corrected to
        minNullPVal <- 0.005

        ### if we have truly differentially spliced genes
        if (NumTrue > 0){
            # Now give the null hypothesis p-values uniform distriubtions
            Catch_PVals[seq((NumTrue+1),(NumTrue+NumNull)),2] <- sort(runif(n=NumNull,minNullPVal,1))
            # Now divide all true p-values by a factor based on the lowest null hypothesis value
            Catch_PVals[seq(NumTrue),2] <- Catch_PVals[seq(NumTrue),2]/(Catch_PVals[NumTrue,2]/minNullPVal)
            # Now compute FDR values
            FDRs <- p.adjust(p = Catch_PVals[,2],method="BH")
        }else{
            ### if we have no truly significant p-values from our estimate
            Catch_PVals[,2] <- sort(runif(n=NumNull,0,1))
            FDRs <- rep(1,nrow(Catch_PVals))
        }
    }

    ### Make statistics table per gene with pvals and FDRs
    Stats_table <- data.frame(Catch_PVals,FDRs)
    return(Stats_table)
}
RMTbioinfo/ExCluster documentation built on May 15, 2019, 3:14 p.m.