R/number_probes.R

Defines functions number_probes

Documented in number_probes

##### II. Number of Features (Gene Probes) to Select For #####
#' Function to determine the number of gene probes to select for in the gene
#' feature selection process.
#' @param input String indicating the name of the file containing your
#' gene expression matrix.
#' @param data.exp The object containing your numeric gene expression matrix.
#' This matrix is an output of the input_file function previously introduced
#' in this package.
#' @param Fixed A positive integer specifying a desired number of gene probes
#' to select for. The default is set to 1000 gene probes.
#' @param Percent A positive integer between 0 and 100 indicating the
#' percentage of total gene probes to select for from the dataset.
#' @param Poly When TRUE, a mean and variance polynomial method is used to
#' determine the number of gene probes to select for. This method uses three
#' second order polynomials to select for the genes with the most variable
#' mean and standard deviations.
#' @param Adaptive When TRUE, Gaussian mixture modeling is used
#' to determine the number of gene probes to select.
#' @param cutoff Positive number between 0 and 1 specifying the false
#' discovery rate (FDR) cutoff to use with the Adaptive Gaussian mixture
#' modeling method. The default value is set to NULL. However, when Adaptive
#' is TRUE, cutoff should be a positive integer between 0 and 1. Common
#' values to use are 0.05 or 0.01.
#' @return Returns an object with the  number of gene probes that will be
#' selected in the gene feature selection process. If the Adaptive option
#' is chosen, Gaussian mixture modeling files containing information about
#' the data's mean, variance, mixing proportion, and gaussian assignment
#' are also outputted.
#' @note When using this function, the user should only use one
#' option (Fixed, Percent, Adaptive) at a time. When using one method, all
#' other options should be set to NULL.
#' @note This function is not needed to determine the number of gene probes
#' to select for in the Poly gene selection method.
#' The particular Poly method does not use a gene probe number input.
#' @seealso \code{\link{input_file}}
#' @author Peiyong Guan, Nathan Lawlor
#' @examples
#' # Example 1: Choosing a fixed gene probe number
#' # Load in a test file
#' data_file <- system.file("extdata", "GSE2034.normalized.expression.txt",
#'     package="multiClust")
#' data <- input_file(input=data_file)
#' gene_num <- number_probes(input=data_file, data.exp=data, Fixed=300,
#'     Percent=NULL, Poly=NULL, Adaptive=NULL, cutoff=NULL)
#'
#' # Example 2: Choosing 50% of the total selected gene probes in a dataset
#' gene_num <- number_probes(input=data_file, data.exp=data, Fixed=NULL,
#'     Percent=50, Poly=NULL, Adaptive=NULL, cutoff=NULL)
#'
#' # Example 3: Choosing the Poly method
#' gene_num <- number_probes(input=data_file, data.exp=data, Fixed=NULL,
#'     Percent=NULL, Poly=TRUE, Adaptive=NULL, cutoff=NULL)
#' \dontrun{
#' # Example 4: Choosing the Adaptive Gaussian Mixture Modeling method
#' # Very long computation time, so example will not be run
#' gene_num <- number_probes(input=data_file, data.exp=data, Fixed=NULL,
#'     Percent=NULL, Poly=NULL, Adaptive=TRUE, cutoff=0.01)
#'    }
#' @export
number_probes <- function(input, data.exp, Fixed=1000, Percent=NULL,
    Poly=NULL, Adaptive=NULL, cutoff=NULL) {
    # Conditional to cause error if user chose multiple methods at once
    if (is.null(Fixed) == FALSE) {
        if (is.null(Percent) == FALSE) {
            stop("Please choose one of the four options
                (Fixed, Percent, Poly, Adaptive) only")
        }
    }

    if (is.null(Fixed) == FALSE) {
        if (is.null(Adaptive) == FALSE) {
            stop("Please choose one of the four options
                (Fixed, Percent, Poly, Adaptive) only")
        }
    }

    if (is.null(Percent) == FALSE) {
        if (is.null(Adaptive) == FALSE) {
            stop("Please choose one of the four options
                (Fixed, Percent, Poly, Adaptive) only")
            }
    }

    if (is.null(Fixed) == FALSE) {
        if (is.null(Poly) == FALSE) {
            stop("Please choose one of the four options
                (Fixed, Percent, Poly, Adaptive) only")
        }
    }

    if (is.null(Percent) == FALSE) {
        if (is.null(Poly) == FALSE) {
            stop("Please choose one of the four options
                 (Fixed, Percent, Poly, Adaptive) only")
        }
    }

    if (is.null(Adaptive) == FALSE) {
        if (is.null(Poly) == FALSE) {
            stop("Please choose one of the four options
                (Fixed, Percent, Poly, Adaptive) only")
        }
    }

    # For Fixed method
    if (is.null(Fixed) == FALSE) {
        # Conditional to make sure Fixed is numeric
        if (is.numeric(Fixed) == FALSE) {
           stop("Please input numeric for Fixed")
        }
        gene_num=Fixed
        # Conditional to produce error if fixed gene number is
            # greater than total genes in matrix
        if (gene_num > dim(data.exp)[1]) {
            stop("Gene number chosen is too large")
        }
        print(paste("The fixed gene probe number is: ", gene_num, sep=""))
        return(gene_num)
    }

    # For percent method
    if (is.null(Percent) == FALSE) {
        # Conditional to produce error if Percent chosen is not greater
            # than 0 or less/equal to 1
        if (Percent < 0) {
            stop("Please choose a positive number between 0 and 100")
        }
        if (Percent > 100) {
            stop ("Please choose a positive number between 0 and 100")
        }
        gene_num2 <- round((Percent / 100) * dim(data.exp)[1])
        print(paste("The percent gene probe number is: ", gene_num2, sep=""))
        return(gene_num2)
    }

    # For Poly method
    if (is.null(Poly) == FALSE) {
        # Make data matrix numeric
        colname <- colnames(data.exp)
        data.exp <- t(apply(data.exp, 1, as.numeric))
        colnames(data.exp) <- colname

        # Get mean and standard deviation of each gene
        row.mean <- apply(data.exp, 1, mean)
        row.sd <- apply(data.exp, 1, stats::sd)
        transinput <- data.exp

        # Fit a second degree polynomial to data
        fit <- stats::lm(row.sd ~ poly(row.mean, 2, raw=TRUE))
        res <- fit$residuals
        # Determine which genes are above the fitted polynomial
        numgenesel <- length(which(res > 0))
        selgenes <- names(which(res > 0))

        # Obtain the selected gene matrix for genes above the polynomial
        vec <- NULL
        for (i in 1:length(selgenes)){
            num <- which(rownames(transinput) == selgenes[i])
            vec <- c(vec, num)
        }
        selmatrix <- transinput[vec,]

        # Get mean and sd of new selected gene matrix
        row.mean <- apply(selmatrix, 1, mean)
        row.sd <- apply(selmatrix, 1, stats::sd)

        # Repeat steps and fit second polynomial to data
        fit <- stats::lm(row.sd ~ poly(row.mean, 2, raw=TRUE))
        res <- fit$residuals
        numgenesel <- length(which(res > 0))
        selgenes <- names(which(res > 0))

        # Obtain selected gene matrix of genes above the second polynomial fit
        vec <- NULL
        for (i in 1:length(selgenes)) {
          num <- which(rownames(selmatrix) == selgenes[i])
          vec <- c(vec, num)
        }
        selmatrix2 <- selmatrix[vec,]

        # Obtain mean and sd for new genes above the second polynomial fit
        row.mean <- apply(selmatrix2, 1, mean)
        row.sd <- apply(selmatrix2, 1, stats::sd)

        # Fit a third polynomial to the data
        fit <- stats::lm(row.sd ~ poly(row.mean, 2, raw=TRUE))
        res <- fit$residuals
        numgenesel <- length(which(res > 0))
        selgenes <- names(which(res>0))

        # Obtain selected gene matrix of genes above the third polynomial fit
        vec <- NULL
        for (i in 1:length(selgenes)) {
          num <- which(rownames(selmatrix) == selgenes[i])
          vec <- c(vec, num)
        }

        # Return the number of genes
        gene_poly <- length(vec)
        print(paste("The poly gene probe number is: ", gene_poly, sep=""))
        return(gene_poly)

    }

    # For adaptive method
    if (Adaptive == TRUE) {

        if (is.numeric(cutoff) == FALSE) {
          stop("Please input numeric for FDR cutoff")
        }

        ## Code for Gaussian Mixture Modeling and Selection Process ##
        ## PART 1: NORMALIZATION OF DATA  ##

        # Function to normalize data to bring values into alignment
        nor.min.max <- function(x) {
            x.min <- min(x)
            x.max <- max(x)
            x <- (x - x.min) / (x.max - x.min)
            return (x)
        }

        # Makes Expression Data Numeric
        colname <- colnames(data.exp)
        data.exp <- t(apply(data.exp, 1, as.numeric))
        colnames(data.exp) <- colname

        # Normalization of expression data
        data.min.max <- t(apply(data.exp, 1, nor.min.max))
        rownames(data.min.max) <- rownames(data.exp)
        print("Your data has been normalized")

        ######## PART 2 Generate Simulated Data #########

        # Make normalized data a numeric matrix
        colname <- colnames(data.min.max)
        data.min.max <- t(apply(data.min.max, 1, as.numeric))
        colnames(data.min.max) <- colname

        #Stores number of columns and rows of normalized data into variables
        dsNoOfCol <- NCOL(data.min.max)
        dsNoOfRow <- NROW(data.min.max)

        #specify the names of columns in the data frame
        colnames(data.min.max) <- c(1:dsNoOfCol)
        data.min.max <- data.min.max[, 1:dsNoOfCol]

        #apply function will apply sd and mean to each value in the object
        tmp.mean <- apply(data.min.max, 1, mean)
        tmp.sd <- apply(data.min.max, 1, stats::sd)

        # rnorm generates a random distribution of numbers
        data.sim <- matrix(data=0, nrow=dsNoOfRow, ncol=dsNoOfCol)
        rownames(data.sim) <- rownames(data.min.max)

        for (i in 1:dsNoOfRow) {
            currGeneExpLevel <- stats::rnorm(dsNoOfCol, tmp.mean[i], tmp.sd[i])
            data.sim[i,] <- (currGeneExpLevel)
        }
        print("The simulated data file has been finished")

        ###### PART 3A Apply Gaussian Mixture Modeling to Real Data ######

        # Define file prefix name and other parameters for GMM
        filePrefix <- paste(input, "real_", sep=".")
        expColStart <- 1
        MAX_NO_OF_DIGITS <- 20

        dsNoOfCol <- NCOL(data.min.max)
        dsNoOfRow <- NROW(data.min.max)

        #Set column names as "number 1" through value of dsNoOfCol
        colnames(data.min.max) <- c(expColStart:dsNoOfCol)

        #Stores column numbers into a data frame
        data.min.max <- data.min.max[, expColStart:dsNoOfCol]

        #paste combines the vectors into one string
        #("number of samples: 234" for example)
        print(paste("number of samples: ", dsNoOfCol, sep=""))
        print(paste("number of genes: ", dsNoOfRow, sep=""))

        # calculating the variances
        print(paste("Begin applying Gaussian Mixture Modeling to real data",
            "may take several minutes"))

        for (i in 1:dsNoOfRow) {
            print(i)
            currGeneExpLevel <- data.min.max[i, expColStart:dsNoOfCol]

            # model the gene using Mclust
            gClust <- mclust::Mclust(t(currGeneExpLevel), G=1:6)

            output <- file(paste(filePrefix, "output_pro.txt", sep=""), "at")
            cat(rownames(data.min.max)[i], format(gClust$parameters$pro,
                digits=MAX_NO_OF_DIGITS), file=output, sep="\t")
            cat("\n", file=output)
            close(output)

            output <- file(paste(filePrefix, "output_mean.txt", sep=""), "at")
            cat(rownames(data.min.max)[i], format(gClust$parameters$mean,
                digits=MAX_NO_OF_DIGITS), file=output, sep="\t")
            cat("\n", file=output)
            close(output)

            output <- file(paste(filePrefix, "output_modelNameG.txt",
                sep=""), "at")
            cat(rownames(data.min.max)[i], gClust$G, gClust$modelName,
                file=output, sep="\t")
            cat("\n", file=output)
            close(output)

            output <- file(paste(filePrefix, "output_sigmasq.txt",
                sep=""), "at")
            cat(rownames(data.min.max)[i],
                format(gClust$parameters$variance$sigmasq,
                       digits=MAX_NO_OF_DIGITS), file=output, sep="\t")
            cat("\n", file=output)
            close(output)

            output <- file(paste(filePrefix, "output_classification.txt",
                sep=""), "at")
            cat(rownames(data.min.max)[i], gClust$classification,
                file=output, sep="\t")
            cat("\n", file=output)
            close(output)
        }

        print("Gaussian mixture modeling has been applied to your real data")

        ########## PART 3B: Apply GMM to Simu Data  ############

        # Define prefix name for simu data and other parameters for GMM
        expColStart <- 1
        filePrefixsim <- paste(input, "Simu_", sep=".")
        MAX_NO_OF_DIGITS <- 20

        dsNoOfCol <- NCOL(data.sim)
        dsNoOfRow <- NROW(data.sim)

        #Set column names as "number 1" through value of dsNoOfCol
        colnames(data.sim) <- c(expColStart:dsNoOfCol)

        #Stores column numbers into a data frame
        data.sim <- data.sim[, expColStart:dsNoOfCol]

        print(paste("number of samples: ", dsNoOfCol, sep=""))
        print(paste("number of genes: ", dsNoOfRow, sep=""))

        print(paste("Begin applying Gaussian Mixture Modeling to Simulated",
            "Data, may take several minutes"))

        for (i in 1:dsNoOfRow) {
            print(i)
            currGeneExpLevel<- data.sim[i,expColStart:dsNoOfCol]

            # model the gene using Mclust
            gClust <- mclust::Mclust(t(currGeneExpLevel), G=1:6)

            output <- file(paste(filePrefixsim, "output_pro.txt",
                sep=""), "at")
            cat(rownames(data.sim)[i], format(gClust$parameters$pro,
                digits=MAX_NO_OF_DIGITS), file=output, sep="\t")
            cat("\n", file=output)
            close(output)

            output <- file(paste(filePrefixsim, "output_mean.txt",
                sep=""), "at")
            cat(rownames(data.sim)[i], format(gClust$parameters$mean,
                digits=MAX_NO_OF_DIGITS), file=output, sep="\t")
            cat("\n", file=output)
            close(output)

            output <- file(paste(filePrefixsim, "output_modelNameG.txt",
                sep=""), "at")
            cat(rownames(data.sim)[i], gClust$G, gClust$modelName,
                file=output, sep="\t")
            cat("\n", file=output)
            close(output)

            output <- file(paste(filePrefixsim, "output_sigmasq.txt",
                sep=""), "at")
            cat(rownames(data.sim)[i],
                format(gClust$parameters$variance$sigmasq,
                    digits=MAX_NO_OF_DIGITS), file=output, sep="\t")
            cat("\n", file = output)
            close(output)

            output <- file(paste(filePrefixsim, "output_classification.txt",
                sep=""), "at")
            cat(rownames(data.sim)[i], gClust$classification,
                file=output, sep="\t")
            cat("\n", file = output)
            close(output)
        }

        print(paste("Gaussian Mixture Modeling has been applied to",
            "your simulated data"))

        ####### PART 4: Determine GMM/TRank estimated number of Genes #######

        # Function that sorts a matrix
        mat.sort <- function(mat,n) {
            mat[rank(as.numeric(mat[,n]),
                ties.method="random"),] <- mat[c(1:nrow(mat)),]
            return(mat)
        }

        # Function to compute the TRank value
        compute.TRank <- function(fn.dataModelNameG,
            fn.dataClassification, fn.dataSigmasq, fn.dataMean,
            fn.dataPro,curr.Sigmasq) {
            # Model Name and G
            dataModelNameG <- utils::read.delim2(fn.dataModelNameG,
                header=FALSE, row.names=1, sep="\t", quote="\"",
                    dec=".", fill=TRUE)

            # discrete classification produced by mclust
            dataClassification <- utils::read.delim2(fn.dataClassification,
                header=FALSE, row.names=1, sep="\t", quote="\"",
                    dec=".", fill=TRUE)

            # Extract Properties of the input file
            # Number of genes
            noOfGene <- NROW(dataModelNameG)
            # Maximum number of gaussians
            maxG <-max(dataModelNameG[, 1])
            # Number of samples
            noOfSample <- NCOL(dataClassification)

            # sigma square produced by mclust
            dataSigmasq <- utils::read.delim2(fn.dataSigmasq,
                header=FALSE, row.names=1, col.names=c(1:(maxG + 1)),
                    sep="\t", quote="\"", dec=".", fill=TRUE)

            # mean produced by mclust
            dataMean <- utils::read.delim2(fn.dataMean,
                header=FALSE, row.names=1, col.names=c(1:(maxG + 1)),
                    sep="\t", quote="\"", dec=".", fill=TRUE)

            # pi-zero produced by mclust
            dataPro <- utils::read.delim2(fn.dataPro,
                header=FALSE, row.names=1, col.names=c(1:(maxG + 1)),
                    sep="\t", quote="\"", dec=".", fill=TRUE)

            # discrete classification produced by mclust
            dataClassification <- utils::read.delim2(fn.dataClassification,
                header=FALSE, row.names=1, col.names=c(1:(noOfSample + 1)),
                    sep="\t", quote="\"", dec=".", fill=TRUE)

            #-------------------------------------------------------
            # This matrix stores the gene selection criteria
            #-------------------------------------------------------
            allCriteria <- matrix(nrow=noOfGene, ncol=8)
            print("Begin computing TRank value")
            #-------------------------------------------------------
            #	Calculating the gene selection criteria
            #-------------------------------------------------------
            for (i in 1:noOfGene) {
                print(paste("Now processing Gene ", i))

                # Get the model information.
                # Extract the number of clusters and model name for each gene
                tmpMMADBID <- as.character(rownames(dataModelNameG[i,]))
                tmpG <- dataModelNameG[i, 1]
                tmpModelName <- as.character(dataModelNameG[i, 2])

                # Temporary MATRIX variables to store the statistical criteria
                tmpPWT <- matrix(data=0, nrow=tmpG, ncol=tmpG)
                #-----------------------------------------------------------
                # Calculate the MIN & MAX proportion (pi-zero)
                #-----------------------------------------------------------
                if(tmpG > 1) {
                    tmpMinPro <- min(as.numeric(dataPro[tmpMMADBID, 1:tmpG]))
                    tmpMaxPro <- max(as.numeric(dataPro[tmpMMADBID, 1:tmpG]))

                    #Added to capture the min and max mean (15 July 2011)
                    tmpMinMean <- min(as.numeric(dataMean[tmpMMADBID, 1:tmpG]))
                    tmpMaxMean <- max(as.numeric(dataMean[tmpMMADBID, 1:tmpG]))
                }

                else {
                    tmpMinPro <- 0
                    tmpMaxPro <- 0

                    #Added to capture the min and max mean (15 July 2011)
                    tmpMinMean <- min(as.numeric(dataMean[tmpMMADBID, 1]))
                    tmpMaxMean <- max(as.numeric(dataMean[tmpMMADBID, 1]))
                }

                #-----------------------------------------------
                # Experiment (9) - sum(pair-wised)
                #-----------------------------------------------

                for(j in 1:(tmpG - 1)) {
                    if(tmpModelName == "X") {
                        # do nothing here
                    }
                    else {
                        for(k in (j + 1):(tmpG - 1)) {
                            # added on 15 July 2011, otherwise wrong.
                            if(k > j) {
                                if(tmpModelName == "E") {
                                    tmpPWT[j, k] <- (abs(dataMean[tmpMMADBID,j]
                                    - dataMean[tmpMMADBID,k]) /
                                    sqrt(2 * dataSigmasq[tmpMMADBID, 1] +
                                    curr.Sigmasq)) * ((dataPro[tmpMMADBID, j] *
                                    dataPro[tmpMMADBID, k]) /
                                    ((dataPro[tmpMMADBID, j] +
                                    dataPro[tmpMMADBID, k]) ^ 2))
                                }
                                else if(tmpModelName == "V") {
                                    tmpPWT[j, k] <- (abs
                                    (dataMean[tmpMMADBID, j] -
                                    dataMean[tmpMMADBID, k]) /
                                    sqrt(dataSigmasq[tmpMMADBID, j] +
                                    dataSigmasq[tmpMMADBID, k] +
                                    curr.Sigmasq)) * ((dataPro[tmpMMADBID, j] *
                                    dataPro[tmpMMADBID, k]) /
                                    ((dataPro[tmpMMADBID, j] +
                                    dataPro[tmpMMADBID, k]) ^ 2))
                                }
                            }
                        }
                    }
                }

                # store all criteria
                allCriteria[i, 1] <- tmpMMADBID
                allCriteria[i, 2] <- tmpG
                allCriteria[i, 3] <- tmpModelName
                allCriteria[i, 4] <- tmpMinPro
                allCriteria[i, 5] <- tmpMaxPro
                #allCriteria[i, 6] = max(as.numeric(tmpPWT))# max
                allCriteria[i, 6] <- sum(as.numeric(tmpPWT))	# sum
                #Added to capture the min and max mean (15 July 2011)
                allCriteria[i, 7] <- tmpMinMean
                allCriteria[i, 8] <- tmpMaxMean
            }

            #-----------------------------------------------------------
            # sort the matrix by one of the allCriteria
            #-----------------------------------------------------------
            sortedAllCriteria <- mat.sort(allCriteria, 6)
            return (sortedAllCriteria)
        }

        #-------------------------------------------
        #	Global constants
        #-------------------------------------------
        MAX_NO_OF_DIGITS <- 20

        # Read in simu data files for TRank calculation
        tmp.fn.dataModelNameG <- paste(filePrefixsim,
            "output_modelNameG.txt", sep="")
        tmp.fn.dataClassification <- paste(filePrefixsim,
            "output_classification.txt", sep="")
        tmp.fn.dataSigmasq <- paste(filePrefixsim,
            "output_sigmasq.txt", sep="")
        tmp.fn.dataMean <- paste(filePrefixsim,
            "output_mean.txt", sep="")
        tmp.fn.dataPro <- paste(filePrefixsim,
            "output_pro.txt", sep="")

        # Calculate the fudge factor for the simu data
        aS0 <- NULL
        currData <- NULL
        for(i in 1:NROW(data.sim)) {
            currData <- as.numeric(data.sim[i,])
            aS0 <- c(aS0,stats::var(currData))
        }

        tmp.Sigmasq <- stats::median(aS0)
        print(paste("The simulated sigamasq is: ", tmp.Sigmasq, sep=""))

        # Call TRank function to calculate simulated data TRank scores
        x <- compute.TRank(tmp.fn.dataModelNameG, tmp.fn.dataClassification,
            tmp.fn.dataSigmasq, tmp.fn.dataMean, tmp.fn.dataPro, tmp.Sigmasq)
        rownames(x) <- as.character(t(x[, 1]))

        # Begin reading in real data files for TRank calculation
        tmp.fn.dataModelNameG <- paste(filePrefix,
            "output_modelNameG.txt", sep="")
        tmp.fn.dataClassification <- paste(filePrefix,
            "output_classification.txt", sep="")
        tmp.fn.dataSigmasq <- paste(filePrefix,
            "output_sigmasq.txt", sep="")
        tmp.fn.dataMean <- paste(filePrefix,
            "output_mean.txt", sep="")
        tmp.fn.dataPro <- paste(filePrefix,"output_pro.txt", sep = "")

        #calculating the fuge factor s0 for real data
        aS0 <- NULL
        currData <- NULL

        for(i in 1:NROW(data.min.max)) {
            currData <- as.numeric(data.min.max[i,])
            aS0 <- c(aS0,stats::var(currData))
        }

        tmp.Sigmasq <- stats::median(aS0)
        print(paste("The real sigamasq is: ", tmp.Sigmasq, sep=""))

        # Call TRank function to calculate TRank scores for real data
        y <- compute.TRank(tmp.fn.dataModelNameG, tmp.fn.dataClassification,
            tmp.fn.dataSigmasq, tmp.fn.dataMean, tmp.fn.dataPro, tmp.Sigmasq)
        rownames(y) <- as.character(t(y[, 1]))

        # Store TRank scores for simu (x.score) and real data (y.score)
        x.score <- as.numeric(t(x[, 6]))
        y.score <- as.numeric(t(y[, 6]))

        #FDR calculations
        maxScore <- max(c(x.score, y.score))
        currCutoff <- min(c(x.score, y.score))
        FDR <- NULL
        bin.size <- 0.001

        while(currCutoff <= maxScore) {
            a <- sum(x.score > currCutoff)
            b <- sum(y.score > currCutoff)
            currFDR <- a / b
            FDR <- rbind(FDR, c(a, b, currCutoff, currFDR))
            currCutoff <- currCutoff + bin.size
        }

        # Choose the TRank with a FDR value of 0.01 for the lowest number
            # of selected genes a represents the index of the TRank
            # value to be chosen

        FDRvalue <- cutoff
        index <- which(FDR[, 4] >= FDRvalue)

        # Conditional if FDR value never reaches specified value
        # Reduce the FDRvalue

        if (length(index) == 0) {
            index <- which(FDR[, 4] >= (FDRvalue / 10))
            if (length(index) == 0) {
                index <- which(FDR[, 4] >= (FDRvalue / 50))
                if (length(index) == 0) {
                    index <- which(FDR[, 4] >= (FDRvalue / 100))
                    if (length(index) == 0) {
                        index <- which(FDR[, 4] >= (FDRvalue / 1000))
                    }
                }
            }
        }

        genenum <- max(index)
        TRank <- FDR[genenum, 3]
        # Conditionals to make sure only one TRank value is chosen
        if (length(TRank) == 0) {
            TRank <- 0.30
            print("Arbitrary TRank value of 0.30 chosen")
            print("The specified FDR cutoff did not occur in the data")
        }

        # Conditional if more than 1 TRank value exists with the FDR cutoff
        if (length(TRank) > 1) {
            TRank <- min(TRank)
        }
        # Save the TRank determined genes to object
        sel.Probe <- as.character(y[as.numeric(y[, 6]) >= TRank, 1])
        gene_num3 <- length(sel.Probe)
        print(paste("The adaptive gene probe number is: ", gene_num3, sep=""))
        return(gene_num3)
    }

}
nlawlor/multiClust documentation built on May 16, 2019, 8:12 p.m.