R/simDataGenerator.R
In BANFF: Bayesian Network Feature Finder

Documented in SimulatedDataGenerator

#' @title {Simulation dataset generator}
#' @aliases SimulatedDataGenerator
#' @name SimulatedDataGenerator
#' @usage
#' SimulatedDataGenerator(net=NULL,nnode=NULL,maxpernull=0.7,class.label=NULL,
#' missloc=NULL,missing=c(FALSE,TRUE),missrate=0.1,nonmiss.hub.maxedges=7,
#' maxnsteps.merge.communties=1000,dist=c("norm","gamma","lognorm"),
#' plot=c(TRUE,FALSE),nbin=c(20,20,20),rng=1024)
#' @description Function used to generating simulated dataset. See details in simulation studies
#' @param net The adjacent matrix with 0/1 indicating "connected" or "not directly connected. If not given, generate a scale-free graphs according to the Barabasi-Albert model by applying the BA algorithm in igraph package.
#' @param nnode Integer. Total number of gene nodes in network.
#' @param maxpernull float. Max percent of null genes in the network. Used when class.label is not given and needs to be generated during merging process. Default=0.7
#' @param class.label Vector of length(total number of nodes), giving the class indicators: -1, 0, 1 to each of the gene node. If not given, class labels are defined based on fast.greedy community detection algorithm, then merged to three sequentially based on the number of between-community-edges. Highly connected communited are merged to one first. Then the largest communities are assigned class indicator 0 as null genes. The up/down regulated class are assigned randomly.
#' @param missloc Vector. Default NULL. If given, it is the location of the test statistics that is not been observed.
#' @param missing Logical. Default FALSE. If TRUE, the missing location are generated based on missing rate.
#' @param missrate A number between (0,1). The missing rate defined as the proportion of gene nodes without observed test statistics. Not recommend over 20\% based on biological knowledge.
#' @param nonmiss.hub.maxedges Integer. Based on biological knowledge, hub genes (with higher number of neighboring edges) are less likely to be missing gene nodes. Thus it is the cutoff value where only genes with less than the nonmiss.hub.maxedges neighbors can be assigned as missing genes. Default=7
#' @param maxnsteps.merge.communties Integer. The maximum number of steps used for merging the small communities. In order to be 3, defaul=1000.
#' @param dist Char. The distribution of DE genes, can be one of the following: c("norm","gamma","lognorm"). See details in simulation design table.
#' @param plot Logical. Defaul=TRUE: whether to plot the histogram of test statistics being generated or not.
#' @param nbin Vector of length 3. Default=c(5,20,5). The number of bins used for ploting the histogram for each of the class.
#' @param rng Random seed Defaul=1024
#' @return A list:
#' \item{testcov}{test statistics, missing observations are coded as NA if any}
#' \item{testcov.fullobs}{ test statistics when all the observations are fully observed}
#' \item{class.label}{ z values for each gene, class indicators}
#' \item{net}{ simulated network, binary adjacency matrix 1/0 connected or not}
#' @details The function used for simulating test statistics:
#' \itemize{
#' \item network is given or generated by Barabasi-Albert algorithm in igraph package.
#' \item class indicators is given or generated based on fast.greedy community detection algorithm. #' \item test statistics, currently support three simulation scenario: c("norm","gamma","lognorm")
#' }
#' @examples
#' \dontrun{
#' ## The simulation settings based on real gene network. (takes time)
#' data(net)
#' data(class.label)
#' data(missloc)
#' simdata=SimulatedDataGenerator(net=net,class.label=class.label,missloc=missloc,
#' dist="norm",plot=TRUE,nbin=c(20,20,20),rng=1024)
#' str(simdata)
#' ## A toy example
#' simdata=SimulatedDataGenerator(nnode=100,missing=TRUE,missrate=0.1,dist="norm",
#' plot=TRUE,nbin=c(20,20,10),rng=1024)
#' str(simdata)
#' }
#' @export
SimulatedDataGenerator=function(net=NULL,nnode=NULL,maxpernull=0.7,class.label=NULL,missloc=NULL,missing=c(FALSE,TRUE),missrate=0.1,nonmiss.hub.maxedges=7,maxnsteps.merge.communties=1000,dist=c("norm","gamma","lognorm"),plot=c(TRUE,FALSE),nbin=c(20,20,20),rng=1024){
  set.seed(rng)
  if(is.null(net)){
    if(is.null(nnode)){
      stop("Please input an integer number as the total number of genes in the network")
    }else{
      g=igraph::sample_pa(nnode,directed=FALSE)
      net=as.matrix(igraph::get.adjacency(g))
      diag(net)=0
    }
  }
  if(is.null(class.label)){
    g=igraph::graph.adjacency(net, mode="undirected")
    graph.com=igraph::fastgreedy.community(g)
    members=table(graph.com$membership)
    memberids=graph.com$membership
    idnames=as.numeric(names(members))
    if(length(members)<3){
      stop("Too simplified case, please consider another simulation setting")
    }else{
      ng=length(members)
      while(ng>3){
        ### calculate betweenness matrix:
        btmat=matrix(0,ng,ng)
        for(i in 1:(ng-1)){
          ptloc.i=which(memberids==idnames[i])
          btmat[i,(i+1):ng]=sapply((i+1):ng,function(j){
            pt.loc.j=which(memberids==idnames[j])
            sum(net[ptloc.i,pt.loc.j])
          })
        }
        max.memberid=which(btmat==max(btmat),arr.ind=TRUE)
        pt.a.merge=apply(max.memberid,1,function(x){sum(members[x])})
        gid=max.memberid[which(pt.a.merge==max(pt.a.merge))[1],]
        ## update
        gid.change=gid[which((gid)==max(gid))]
        gid.stay=gid[which((gid)==min(gid))]

        tmp.memberids=memberids
        tmp.memberids[which(tmp.memberids==idnames[gid.change])]=idnames[gid.stay]
        tmp.members=table(tmp.memberids)

        if(length(tmp.members)<3 | max(tmp.members)> (maxpernull*nnode)){
          break;
        }else{
          memberids=tmp.memberids
          members=tmp.members
          ng=length(members)
          idnames=as.numeric(names(members))
        }
        # memberids[which(memberids==idnames[gid.change])]=idnames[gid.stay]
        # members=table(memberids)
        # ng=length(members)
        # idnames=as.numeric(names(members))
        # print(ng)
      }

      g0=idnames[which(members==max(members))]
      class.label=rep(NA,length=nnode)
      class.label[which(memberids==g0)]=0
      ng=length(members)-1
      sigloc=which(is.na(class.label))

      subnet=net[sigloc,sigloc]
      memberids=memberids[sigloc]
      members=table(memberids)
      idnames=as.numeric(names(members))

      ### random assign labels as long as balanced.
      g1=sample(idnames,1)
      n1=members[as.character(g1)]
      while(n1<(length(sigloc)/2)){
        l=sample(setdiff(idnames,g1),1)
        g1=c(g1,l)
        n1=n1+members[as.character(l)]
      }
      class.label[which(memberids%in%g1)]=1
      class.label[is.na(class.label)]=-1
    }
  }

  simRDat=vector(length=length(class.label))
  npt=table(class.label)
  if(dist=="norm"){
    simRDat[which(class.label==-1)]=stats::rnorm(npt[1],mean=-0.8,sd=0.2)
    simRDat[which(class.label==0)]=stats::rnorm(npt[2],mean=0,sd=0.2)
    simRDat[which(class.label==1)]=stats::rnorm(npt[3],mean=0.8,sd=0.2)
  }else if(dist=="gamma"){
    simRDat[which(class.label==0)]=stats::rnorm(npt[2],mean=0,sd=0.4)
    y=stats::rgamma(10*nnode,shape=2,scale=0.5)
    simRDat[which(class.label==-1)]=sample(y[which(y<(0.1+1.9))],size=npt[1],replace=FALSE)-1.9
    y=stats::rgamma(10*nnode,shape=2,scale=0.3)
    simRDat[which(class.label==1)]=-sample(y[which(y<(0.1+1.7))],size=npt[3],replace=FALSE)+1.7
  }else if(dist=="lognorm"){
    simRDat[which(class.label==0)]=stats::rnorm(npt[2],mean=0,sd=0.4)
    y=stats::rlnorm(10*nnode,meanlog=0,sdlog=1)
    simRDat[which(class.label==-1)]=sample(y[which(y<(0.1+1.9))],size=npt[1],replace=FALSE)-1.9
    y=stats::rlnorm(10*nnode,meanlog=0,sdlog=0.4)
    simRDat[which(class.label==1)]=-sample(y[which(y<(0.1+2.2))],size=npt[3],replace=FALSE)+2.2
  }

  if(plot){
    p1=graphics::hist(simRDat[which(class.label==-1)],breaks=nbin[1])
    p2=graphics::hist(simRDat[which(class.label==0)],breaks=nbin[2])
    p3=graphics::hist(simRDat[which(class.label==1)],breaks=nbin[3])
    ll=max(p1$counts,p2$counts,p3$counts)
    plot(p1, col=grDevices::rgb(0,0,1,1/4), xlim=range(simRDat),ylim=c(0,ll),main=paste0("Simulated ",dist," Cases"),xlab="Test Statistics")
    plot(p2, col=grDevices::rgb(1,0,0,1/4), xlim=range(simRDat),ylim=c(0,ll), add=TRUE)
    plot(p3, col=grDevices::rgb(0,1,0,1/4), xlim=range(simRDat),ylim=c(0,ll), add=TRUE)
    graphics::legend("topleft",c("down-regulated","null","up-regulated"),fill=c(grDevices::rgb(0,0,1,1/4),grDevices::rgb(1,0,0,1/4),grDevices::rgb(0,1,0,1/4)),bty = 'n',cex=1)
  }

  if(is.null(missloc) & missing){
    misloc=which(rowSums(net)<nonmiss.hub.maxedges)
    nmis=round(missrate*nnode)
    if(length(misloc)<round(missrate*nnode)){
      stop("Change missrate or nonmiss.hub.maxedges")
    }else{
      missloc=sample(misloc,size=nmis,replace=FALSE)
    }
  }

  if(missing){
    simRDatFull=simRDat
    simRDat[missloc]=NA
  }else{
      simRDatFull=NULL
  }


  res=list(testcov=simRDat,testcov.fullobs=simRDatFull,class.label=class.label,net=net)
  return(res)
}