R/simData.R
In optBiomarker: Estimation of Optimal Number of Biomarkers for Two-Group Microarray Based Classifications at a Given Error Tolerance Level for Various Classification Rules

Documented in simData

##
## This program is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License 
## as published by the Free Software Foundation; either version 2 
## of the License, or (at your option) any later version.
##  
## This program is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General 
## Public License for more details.
##   
## You may also obtain a copy of the GNU General Public License from 
## the Free Software Foundation by visiting their Web site or by writing to
##   
##
##  Free Software Foundation, Inc.
##  59 Temple Place - Suite 330
##  Boston, MA 02111-1307
##  USA
##
################################################################################

## Function for generating nPt by nBiom data matrix for classification analysis 
## ----------------------------------------------------------------------------

## nTrain = Number of subjects in the training set (nGr1+nGr2)
## nGr1 = Number of subjects in Group 1
## nGr2 = Number of subjects in Group 2 
## nBiom = Number of biomarkers (genes, proteins)
## nRep= Number of technical replications

## sdW= sqrt(experimental or technical variation)
## sdB= sqrt(biological variation)
## rhoMax= Maximum Pearson's correlation coefficient between biomarkers
## rhoMinx= Minimum Pearson's correlation coefficient between biomarkers
##
## foldMin= Minimum value of fold changes
## nBlock= number of blocks in the block diagonal (Hub-Toeplitz) correlation matrix
## bSizes= a vector of block sizes (should sum to nBlock)
## gamma= if NULL, assume independence. gamma>=0 specify a correlation structure. gamma=0 indicates a single block exchangeable correlation marix with constant correlation rho=0.5*(rhoMin+rhoMax). A value greater than zero (e.g., gamma=1) block diagonal (Hub-Toeplitz) correlation matrix.
## sigma= Standard deviation of the normal distribution (before truncation)
##        where fold changes are generated from  
## baseExpr = A vector of length nBiom to be used as \mu_g (log2 scale, <16) 

## Main function for simulating data
## ---------------------------------


simData<-function(nTrain=100, nGr1=floor(nTrain/2), nBiom=50,nRep=3,
                  sdW=1.0, sdB=1.0 ,rhoMax=NULL, rhoMin=NULL, nBlock=NULL,bsMin=3, bSizes=NULL, gamma=NULL,
                  sigma=0.1,diffExpr=TRUE, foldMin=2,orderBiom=TRUE,baseExpr=NULL)
{
    checkInt<-c(nTrain,nGr1,nBiom, nRep)
    if(!identical(checkInt, floor(checkInt))) stop("non-integer(s) given where argument(s) should be integer valued") 

    if (!is.null(baseExpr) && length(baseExpr)!=nBiom)
    {stop("length of 'baseExpr' should be equal to nBiom") }

    if(nTrain<2) stop("training set size can not be less than 2")

    if( any(c(nGr1, nBiom, nRep)<1)) stop("'nGr1', 'nBiom' and 'nRep' can not be less than 1") 
    nGr2<-nTrain-nGr1

    if (nGr2<1) stop("'nTrain' must be greater than 'nGr1'")
    if(!is.null(rhoMax) && (rhoMax<0 || rhoMax>0.95))stop("allowed values of 'rhoMax' are between 0 and 0.95 inclusive")
    
     if (!is.null(rhoMin) && (rhoMin<0 || rhoMin>0.95))stop("allowed values of 'rhoMin' are between 0 and 0.95 inclusive")
     if (!is.null(rhoMin) && !is.null(rhoMax) && (rhoMin>rhoMax))stop("'rhoMin' should be less than or equal to 'rhoMax'")
    if (!is.null(bSizes) && sum(bSizes)!=nBiom)stop("block sizes do not add up to the nBiom")
    if (!is.null(gamma) && gamma<0) stop("'gamma' should be non-negative.")

    ## Simulate  nTrain by nBiom by nRep array of experimental errors
    ## --------------------------------------------------------------
    if (nRep>1) e<-array(rnorm(nTrain*nBiom*nRep,mean=0,sd=sdW),dim=c(nTrain,nBiom,nRep)) else
                e<-array(rnorm(nTrain*nBiom,mean=0,sd=sdW),dim=c(nTrain,nBiom))
    
    ## Average e over replicates
    if (nRep>1) eAvg<-apply(e,c(1,2),mean) else eAvg<-e
    
    ## Simulate  nTrain by nBiom matrix  of biological errors
    ## ------------------------------------------------------
    b<-matrix(rnorm(nTrain*nBiom, mean=0,sd=sdB),nTrain)
    
    ## Add biological and experimental errors
    ## ---------------------------------------
    randErr<-b+eAvg

    ## Introduce correlation structure
    ## -------------------------------

    ## Block diagonal (Hub-Toeplitz) or Exchangeable correlation structure
   
 if (nBiom>1 && !is.null(gamma)) {
    ## Define exchangeable correlation matrix
    if (gamma==0) {
        repeat{
        if (is.null(rhoMax)) rhoMax<-runif(1,min=0.6,max=0.8)
        if (is.null(rhoMin)) rhoMin<-runif(1,min=0.2,max=0.4)
        if (rhoMin>rhoMax)stop("'rhoMin' should be less than or equal to 'rhoMax'")
        Rho<-diag(rep(1,nBiom))
        Rho[upper.tri(Rho)|lower.tri(Rho)]<-0.5*(rhoMax+rhoMin)
        if(all(eigen(Rho,only.values=TRUE)[[1]]>0)) break
    }
    }
    ## Block diagonal (Hub-Toeplitz) correlation structure
    if (gamma>0){
        ## Define block number and block sizes
        if (nBiom<5)bMax<-1 else bMax<-floor(nBiom/bsMin)
        if(is.null(nBlock)) nBlock<-sample(1:bMax,1)
        if(is.null(bSizes)){
            bs<-nBiom%/%nBlock
            mod<-nBiom%%nBlock
            bs1<-bs+mod
            bSizes<-c(bs1,rep(bs,nBlock-1))
        }
        lRho<-vector("list", nBlock)
        lRho
        for (i in 1:length(bSizes)){
            lRho[[i]]<-diag(rep(1, bSizes[i]))
        }

         fill<-function(x,rhoMax, rhoMin, gamma){
           
            k<-ncol(x)
            y<-x[1,]
            y[2:k]<-rhoMax-(((2:k)-2)/(k-2))^gamma*(rhoMax-rhoMin)
            for (i in 1:(k-1)){
                x[i,(i+1):k]<-y[2:(k+1-i)]
            }
            x[lower.tri(x)]<-t(x)[lower.tri(t(x))]
           return(x)                	
        }
        
        if (nullMax<-is.null(rhoMax)) rhoMax<-runif(1,min=0.6,max=0.8)
        if (nullMin<-is.null(rhoMin)) rhoMin<-runif(1,min=0.2,max=0.4)

        lRho<-lapply(lRho, fill, rhoMax, rhoMin,gamma)
        if (nullMax || nullMin){
            repeat{
                posd<-lapply(lRho, function(x) eigen(x, only.values=TRUE)$values>0)
                if (all(unlist(posd))) break
                rhoMax<-runif(1,min=0.6,max=0.8)
                rhoMin<-runif(1,min=0.2,max=0.4)
                lRho<-lapply(lRho, fill, rhoMax, rhoMin,gamma) 
            }
            
        }
        
        Rho<-as.matrix(Matrix::bdiag(lRho))
    }
      
      ## Existing standard deviations in the data
      
      sdMat<-diag(sqrt(diag(var(randErr))))

      ## Scale the data to have unit variances
      randErrScaled<-scale(randErr,center=FALSE,scale=TRUE)
   
      ## Covariance matrix with desired Rho


      
      covMat<-sdMat%*%Rho%*%sdMat
      cholRoot<-chol(covMat) ## property: t(cholRoot)%*%cholRoot==covMat

      ## Transformed data to have desired covariance structure

      randErrTrans<-randErrScaled%*%cholRoot

      ## Replace original data by the transformed data

      randErr<- randErrTrans
}

    ## Give column names
    ## -----------------
    colnames(randErr)<-paste("Biomarker",1:nBiom, sep="")
    
    ## Add \mu_g (baseline expression) to each row of randErr
    ## ------------------------------------------------------
    
    if (is.null(baseExpr))G<-rep(5,nBiom) else G<-baseExpr
    avgData<-randErr+matrix(G, nrow=nTrain, ncol=nBiom, byrow=T)

    
    ## Introducing differential expressions
    ##-------------------------------------
     
    if(diffExpr){
    ## Generate indicators for up/down regulation
    upDown<-sample(c(-1,1),nBiom,replace = TRUE)
        
    ## Generate log2(fold-change) from half normal distribution with mean=0, sd=sigma
    
    foldChange<-rtnorm(nBiom,mean=0,sd=sigma,lower=log2(foldMin))

    ## Generate "noChange" for the healthy group from normal distribution with mean=0, sd=sigma
    ## This is just to match the variability of the data in both groups

    noChange<-rtnorm(nBiom,mean=0,sd=sigma,lower=-Inf)
    
    if(orderBiom) foldChange<-sort(foldChange)
    
    ## Make the data in the diseased group (D) to be up/down regulated
    ## by the amount foldChange
      
    diffD<-matrix(foldChange*upDown, ncol=nBiom, nrow=nGr2, byrow=T)

    diffH<-matrix(noChange, ncol=nBiom, nrow=nGr1, byrow=T)
    
    avgData[(nGr1+1):nTrain,]<-avgData[(nGr1+1):nTrain,]+diffD
    avgData[1:nGr1,]<-avgData[1:nGr1,]+diffH
      }

    ## Prepare data for classification
    ##--------------------------------

    ## Define groups (H=Healthy, D=Diseased)

    class<-factor(c(rep("H",nGr1),rep("D",nGr2)))
    data<-data.frame(class=class,avgData)
    return(list(data=data,corrMat=if(is.null(gamma)) NULL else Rho))
}

################################################################################
## End of:                         simData.R
################################################################################

Any scripts or data that you put into this service are public.

optBiomarker documentation built on Jan. 19, 2021, 1:06 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

optBiomarker
Estimation of Optimal Number of Biomarkers for Two-Group Microarray Based Classifications at a Given Error Tolerance Level for Various Classification Rules

R/simData.R
In optBiomarker: Estimation of Optimal Number of Biomarkers for Two-Group Microarray Based Classifications at a Given Error Tolerance Level for Various Classification Rules

Defines functions simData

Documented in simData

Try the optBiomarker package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

optBiomarker Estimation of Optimal Number of Biomarkers for Two-Group Microarray Based Classifications at a Given Error Tolerance Level for Various Classification Rules

R/simData.R In optBiomarker: Estimation of Optimal Number of Biomarkers for Two-Group Microarray Based Classifications at a Given Error Tolerance Level for Various Classification Rules

Defines functions simData

Documented in simData

Try the optBiomarker package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

optBiomarker
Estimation of Optimal Number of Biomarkers for Two-Group Microarray Based Classifications at a Given Error Tolerance Level for Various Classification Rules

R/simData.R
In optBiomarker: Estimation of Optimal Number of Biomarkers for Two-Group Microarray Based Classifications at a Given Error Tolerance Level for Various Classification Rules