simulation/generate_data/SimulationFunction.R

source("GenerateMutationTable.R")
## Simulations are different cause we add the nonsilent mutation with extrem case, type of highest mutation rate/lowest mutation rate
load("Bij.rdata") #load("bijAndQvec.Rdata")
load("pat.rate.list.Rdata")
load("ov.simulation.table.rdata")#load("final.table.Rdata")
#####################################
bij <- bijAndQvec$bij
rm(bijAndQvec)
G0 = length(bij)
N0 = length(bij[[1]])
Bm = matrix(rep(0, G0 * N0), ncol = N0)
for (g in 1:G0) {
  Bm[g, ] <- bij[[g]]
}
############################# find the names of genes
gene.rep.expr.file = "gene.rep.expr.RData"
gene.ptr <- load(gene.rep.expr.file)
gene.rep.expr <- get(gene.ptr)
rm(gene.ptr)
gid <- unlist(sapply(gene.rep.expr, function(x) x[[1]]))
###########################################
sample.name = names(bij[[1]])
####################################################
rownames(Bm) <- gid
colnames(Bm) <- sample.name
###############################################
use.id = which(rowSums(is.na(Bm)) == 0)
Bmat = Bm[use.id, ]
EtaMat = 1 - Bmat
mut.gene.name1 = rownames(EtaMat)


##############################
## seed.pat = set seed for picking patients
##nPat = # of patient picked for simulation < total number of patients
## frac = fraction of three mutually exclusive genes
## NoiseGeneNum = # of noise gene added
## extrem.ind.mut = minimun # of noise mutations that we want for noise genes
## trialNum = trials name
TP53 = "ENSG00000141510"
SimulGen <- function(seed.pat,nPat,NumofaddPat=NumofaddPat,frac=frac,NoiseGeneNum=17,extrem.ind.mut=5,trialNum=101,
                     EtaMat=EtaMat,pat.name=pat.name,final.table=final.table,mut.gene.name1=mut.gene.name1){
  ##################################random pick 400 patients here!!!!!!!!!!!
  pat.name = colnames(EtaMat)
  set.seed(seed.pat)
  pat.pick = sample(pat.name, nPat)
  final.table = final.table[final.table$Tumor_Sample_Barcode %in% pat.pick,]  ## only look at these pick patients' mutations
  ############################### find all genes that have at least 2 mutations
  atleast2.pos = which(table(final.table[, 1]) >= 2 & table(final.table[, 1]) <= 100)
  mut.gene.name2 <- names(table(final.table[, 1])[atleast2.pos])
  ###################### All the genes that have at least 2 mutations and no NA values of rate
  mut.gene.name = intersect(mut.gene.name1, mut.gene.name2)
  EtaMat  = EtaMat[mut.gene.name,pat.pick]     #update EtaMat for only picked patients
  ############################################################
  ############################################################
  ############################################################
  NumofaddPat = NumofaddPat
  listof.pat.rate.list.test <- NULL
  listof.pat.mut.list.test <- NULL
  listof.candi.gene <- NULL
  BigObsMat.list <- NULL
  mut.gene.mat <- matrix(rep(NA,3*100),ncol=100)
  frac = frac
  ########### random pick three genes for mutually exclusive pattern
  for (iter in 1:100) {
    set.seed(iter)
    mut.gene = sample(setdiff(mut.gene.name,TP53), 3)
    mut.gene.mat[,iter]<-mut.gene
    #############################################################################################
    mut.gene.pos = which(final.table[, 1] %in% mut.gene)  # pos of mut.gene in table
    exist.pat = unique(final.table[mut.gene.pos, 8])  # pat already have mutation
    NumofMut = table(final.table[mut.gene.pos, 1])  # How many mutations already have for each gene
    mut.gene.reorder = names(NumofMut)
    #################################################
    set.seed(iter)
    add.pat = sample(setdiff(pat.pick, exist.pat), NumofaddPat)
    pos.mut = lapply(mut.gene.reorder,function(x)(which(final.table[,1]==x)))
    i1 = as.integer(frac*NumofaddPat)[1]
    i2 = as.integer(frac*NumofaddPat)[2]
    i3 = NumofaddPat -i1 - i2
    add.pos.table1=NULL
    add.pos.table2=NULL
    add.pos.table3=NULL
    final.tableG1 = final.table[pos.mut[[1]],]
    final.tableG2 = final.table[pos.mut[[2]],]
    final.tableG3 = final.table[pos.mut[[3]],]
    len1 = dim(final.tableG1)[1]; len2 = dim(final.tableG2)[1]; len3 = dim(final.tableG3)[1]
    temp1 = do.call("rbind",rep(list(final.tableG1),as.integer(i1/len1)+1))
    add.pos.table1 = temp1[1:i1,]
    temp2 = do.call("rbind",rep(list(final.tableG2),as.integer(i2/len2)+1))
    add.pos.table2 = temp2[1:i2,]
    temp3 = do.call("rbind",rep(list(final.tableG3),as.integer(i3/len3)+1))
    add.pos.table3 = temp3[1:i3,]
    #############################################
    add.pos.table = rbind(add.pos.table1, add.pos.table2, add.pos.table3)
    ####
    add.pos.table[, 8] <- add.pat
    simu.final.table = rbind(final.table, add.pos.table)  # simulated mutation table
    ###################### Organize it to be the form used in TGscore function
    gen.name = rownames(pat.rate.list[[1]])
    pat.mut.list = Reorganize.pat.mut(final.table = simu.final.table, gen.name = gen.name, Pat.name = pat.pick)
    ### load data############## the data have: pat.mut.list simu.final.table mut.gene NumofMut add.pat atleast2.gene.id = which(
    ### rowSums(Reduce('+',pat.mut.list))>=2 ) get the genes have at least 2 mutations and not NA mutation rate
    pat.mut.list <- lapply(pat.mut.list, function(x) x[mut.gene.name, ])
    pat.rate.list = pat.rate.list[pat.pick]
    pat.rate.list <- lapply(pat.rate.list, function(x) x[mut.gene.name, ])
    ############################
    gen.name = rownames(pat.mut.list[[1]])
    ### Generate big observation table
    temp.mut = lapply(pat.mut.list, function(x) as.numeric(rowSums(x) > 0))
    BigObsMat = do.call(cbind, temp.mut)
    rownames(BigObsMat) <- gen.name
    colnames(BigObsMat) <- pat.pick
    BigObsMat.list[[iter]] <- BigObsMat
    #######################################
    mutual.ind = which(gen.name %in% mut.gene)
    #######################################
    extrem.ind = which(rowSums(BigObsMat) >=extrem.ind.mut & rowSums(BigObsMat)<=nPat*0.5)
    #######################################
    set.seed(iter)
    ind = c(mutual.ind, sample(setdiff(extrem.ind, mutual.ind), NoiseGeneNum))
    ###################################
    temp.pat.rate.list.test = lapply(pat.rate.list, function(x) x[ind, ])
    temp.pat.mut.list.test = lapply(pat.mut.list, function(x) x[ind, ])
    candi.geneTemp = rownames(temp.pat.mut.list.test[[1]])
    listof.candi.gene[[iter]] <- candi.geneTemp
    listof.pat.rate.list.test[[iter]] <- temp.pat.rate.list.test
    listof.pat.mut.list.test[[iter]] <- temp.pat.mut.list.test
  }
  ########################
  filename = paste("simulated.data",trialNum,"Pat",NumofaddPat,".Rdata",sep="")
  save(seed.pat,NumofaddPat,frac,listof.pat.mut.list.test, listof.pat.rate.list.test,
       mut.gene.mat,NumofMut, BigObsMat.list, EtaMat,listof.candi.gene,file = filename)
  ########################
  listRes = list(NumofaddPat=NumofaddPat,frac=frac,EtaMat=EtaMat,mut.gene.mat=mut.gene.mat,
                 listof.pat.mut.list.test=listof.pat.mut.list.test,listof.pat.rate.list.test=listof.pat.rate.list.test,
                 BigObsMat.list=BigObsMat.list,listof.candi.gene=listof.candi.gene)
  #########################
  return(listRes)
}



#################Creat simulation setting:
##############################
## seed.pat = set seed for picking patients
##nPat = # of patient picked for simulation
##NumofaddPat = # of patients that are imposed the mutually exclusive pattern on.
## frac = fraction of three mutually exclusive genes
## NoiseGeneNum = # of noise gene added
## extrem.ind.mut = minimun # of noise mutations that we want for noise genes
## trialNum = trials name
Res = SimulGen(seed.pat=12345,nPat=200,NumofaddPat=20,frac=c(1/3,1/3,1/3),NoiseGeneNum=17,extrem.ind.mut=5,trialNum=50,
               EtaMat=EtaMat,pat.name=pat.name,final.table = final.table, mut.gene.name1=mut.gene.name1)

Res = SimulGen(seed.pat=12345,nPat=200,NumofaddPat=40,frac=c(1/3,1/3,1/3),NoiseGeneNum=17,extrem.ind.mut=5,trialNum=51,
               EtaMat=EtaMat,pat.name=pat.name,final.table = final.table, mut.gene.name1=mut.gene.name1)

Res = SimulGen(seed.pat=12345,nPat=200,NumofaddPat=60,frac=c(1/3,1/3,1/3),NoiseGeneNum=17,extrem.ind.mut=5,trialNum=52,
               EtaMat=EtaMat,pat.name=pat.name,final.table = final.table, mut.gene.name1=mut.gene.name1)

Res = SimulGen(seed.pat=12345,nPat=200,NumofaddPat=20,frac=c(1/2,1/3,1/6),NoiseGeneNum=17,extrem.ind.mut=5,trialNum=53,
               EtaMat=EtaMat,pat.name=pat.name,final.table = final.table, mut.gene.name1=mut.gene.name1)

Res = SimulGen(seed.pat=12345,nPat=200,NumofaddPat=40,frac=c(1/2,1/3,1/6),NoiseGeneNum=17,extrem.ind.mut=5,trialNum=54,
               EtaMat=EtaMat,pat.name=pat.name,final.table = final.table, mut.gene.name1=mut.gene.name1)

Res = SimulGen(seed.pat=12345,nPat=200,NumofaddPat=60,frac=c(1/2,1/3,1/6),NoiseGeneNum=17,extrem.ind.mut=5,trialNum=55,
               EtaMat=EtaMat,pat.name=pat.name,final.table = final.table, mut.gene.name1=mut.gene.name1)



Res = SimulGen(seed.pat=12345,nPat=200,NumofaddPat=80,frac=c(1/3,1/3,1/3),NoiseGeneNum=17,extrem.ind.mut=5,trialNum=80333,
               EtaMat=EtaMat,pat.name=pat.name,final.table = final.table, mut.gene.name1=mut.gene.name1)


Res = SimulGen(seed.pat=12345,nPat=200,NumofaddPat=80,frac=c(1/2,1/3,1/6),NoiseGeneNum=17,extrem.ind.mut=5,trialNum=80236,
               EtaMat=EtaMat,pat.name=pat.name,final.table = final.table, mut.gene.name1=mut.gene.name1)
MarkeyBBSRF/MEScan documentation built on Nov. 19, 2020, 4:49 a.m.