R/sim_TSP_data.R

### data simulation for TSP 
### simulate the data follow a certain distribution, so that truth is known


simu.multi <- function(Gnull, Gsig, Ntrain, Ntest, SDgenemu=5, DFgenesd=1, Clow=2, Cup=Inf, MuShift=3, SDshift=0.5,seed=15213, label = c(-1,1)){
  ### input
  # Gnull - number of null genes
  # Gsig - number of significant genes
  # Ntrain - a vector for number of training samples for each class (can be multi-class)
  # Ntest - a vector for number of testing samples (same length with Ntrain)
  # SDgenemu - SD for null gene mu value
  # DFgenesd - degree of free for null gene SD
  # Clow - significant gene shift lower bound
  # Cup - significant gene shift upper bound
  # MuShift - test shift mean value
  # SDshift - test shift SD value
  # seed - seed for random number
  # label - Class label (-1,1) for binary and (-1,0,1) for multi
  
  ### output
  # Xtrain - training expression table, gene by sample
  # Ytrain - training outcome label
  # Xtest - testing expression table, gene by sample
  # Ytest - testing outcome label
  # sigIndex - significant gene index, TRUE for sig and FALSE for non-sig
  # geneMU - gene mu value, a gene-by-group matrix
  # gneeSD - gene SD value, a vector with length of gene
  
  
  set.seed(seed)
  
  Gsig=as.integer(Gsig)
  if(Gsig<=0){
    stop("Gsig - number of significant genes - has to be a positive integer")
  }
  
  Gnull=as.integer(Gnull)
  if(Gnull<=0){
    stop("Gnull - number of null genes - has to be a positive integer")
  }
  
  Ntrain=as.integer(Ntrain)
  if(length(Ntrain)<2){
    stop("Training samples need to have at least two outcomes.")
  }
  
  Ntest=as.integer(Ntest)
  if(length(Ntrain)!=length(Ntest)){
    stop("Length of Ntrain has to be equal to length of Ntest.")
  }
  
  ### prepare gene number and index
  Gall=Gsig+Gnull
  Gtotal=sum(Gall)
  GstartInd=c(1,Gsig+1)
  GendInd=c(Gsig,Gall)
  
  ### prepare sample number and index
  K=length(Ntrain)  # number of outcome categories
  Nall=Ntrain+Ntest # total number
  Ntotal=sum(Nall)
  NstartInd=rep(1,times=length(Nall)) # start index  
  NendInd=rep(Nall[1],times=length(Nall)) # end index
  for(i in 2:length(Nall)){
    NstartInd[i]=NendInd[i-1]+1
    NendInd[i]=NstartInd[i]+Nall[i]-1
  }
  
  ### null distribution
  E=matrix(0,nrow=Gtotal,ncol=Ntotal)  # expression table
  gene_mu=rnorm(Gtotal,mean=0,sd=SDgenemu)  # gene mu follows normal distribution
  gene_sd=sqrt(rchisq(n=Gtotal,df=DFgenesd))   # gene variance follows chisq distribution
  for(i in 1:Gtotal){
    E[i,]=rnorm(Ntotal,mean=gene_mu[i],sd=gene_sd[i]) 
  }
  
  ### sig gene
  shiftC=matrix(rtruncnorm(Gsig*K,a=Clow,b=Cup,mean=0,sd=SDgenemu),nrow=Gsig,ncol=K)
  direction=matrix(0,nrow=Gsig,ncol=K)  # showing the gene direction for each outcome category
  for(i in 1:Gsig){
    while(length(unique(direction[i,]))==1){  # not the sample direction
      direction[i,]=sample(label,size=K,replace=T)
    }
    
    for(j in 1:K){
      E[i,NstartInd[j]:NendInd[j]]=E[i,NstartInd[j]:NendInd[j]] + direction[i,j]*shiftC[i,j]
    }
  }
  
  ## summarize the data
  
  # sig gene index
  sigIndex=rep(FALSE,times=Gtotal)
  sigIndex[1:Gsig]=TRUE
  
  # gene effect
  geneMU=matrix(gene_mu,nrow=Gtotal,ncol=K)
  geneMU[sigIndex,]=geneMU[sigIndex,]+direction*shiftC
  
  
  ## split training and testing expression
  trainInd=c()
  for(j in 1:K){
    trainInd=c(trainInd,(NstartInd[j]:(NstartInd[j]+Ntrain[j]-1)))
  }
  
  if(length(trainInd)>0){
    Xtrain=E[,trainInd]
  }else{
    Xtrain=c()
  }
  
  if(length(trainInd)<Ntotal){
    Xtest=E[,-trainInd]
    ## add shift to test value
    for(i in 1:nrow(Xtest)){
      Xtest[i,]=Xtest[i,]+rnorm(ncol(Xtest),mean=MuShift,sd=SDshift)
    }
  }else{
    Xtest=c()
  }
  
  ## train and test y label
  Ytrain=rep(1:K,times=Ntrain)
  Ytest=rep(1:K,times=Ntest)
  
  
  return(list(Xtrain=Xtrain,Ytrain=Ytrain,Xtest=Xtest,Ytest=Ytest,
              sigIndex=sigIndex, geneMU=geneMU, geneSD=gene_sd))
  
}



#### testing the simu.multi function, very small number
Gnull=50
Gsig=10
Ntrain=c(5,5,10)
Ntest=c(5,4,8)
# SDgenemu=5
# DFgenesd=1
# Clow=2
# Cup=Inf
# MuShift=3
# SDshift=1
# seed=15213

out=simu.multi(Gnull, Gsig, Ntrain, Ntest)

### plot to check gene expression
E=out$Xtest
E=out$Xtrain
plot(NA,xlim=c(1,ncol(E)),ylim=range(E),xlab="sample",ylab="expression")
for(i in (Gsig+1):nrow(E)){
  lines(x=1:ncol(E),y=E[i,],col="grey")
}
for(i in 1:Gsig){
    lines(x=1:ncol(E),y=E[i,],col=sample(rainbow(10),1))
}
KellyCahill/rfTSP documentation built on May 21, 2019, 1:41 p.m.