R/05-generatingSimulations.R
In CloneData: Data to Support CloneSeeker Algorithm

Documented in generateMixtures generateSimulationSet

generateSimulationSet <- function(simPath, dataPath, nPerK, rounds = 400, nu = 0, pcnv = 1, norm.contam = FALSE, dataPars = NULL){
  dataPars <- list(snps.seq = 1000000, snps.cgh = 600000, mu = 70, sigma.reads = 25, sigma0.lrr = 0.15,
                   sigma0.baf = 0.03, density.sigma = 0.1)
  threshold <- 0.04
  dirs <- list.dirs(simPath)
  dirs <- dirs[-1]
  if(length(dirs)>0){
    setIDs <- as.numeric(sapply(1:length(dirs),function(j){strsplit(dirs[j], split = 'sims-')[[1]][2]}))
    setID <- max(setIDs) + 1
  } else {
    setID <- 1
  }
  if (!dir.exists(simPath)){
    dir.create(simPath)
  }
  if (!dir.exists(dataPath)){
    dir.create(dataPath)
  }
  ks <- unlist(lapply(1:length(nPerK), function(x) {rep(nPerK[x], x)}))
  psis <- Reduce(rbind, lapply(1:5, function(x) {
    mat <- sampleSimplex(nPerK[x],  x)
    cbind(mat, matrix(0, nrow = nrow(mat), ncol = 5-ncol(mat)))
  }))
  n <- nrow(psis)
  for (j in 1:nrow(psis)) {
    go <- FALSE
    while (go == FALSE) {
      tumor <- try(Tumor(psis[j,], rounds, nu, pcnv, norm.contam), silent = TRUE)
      if (!inherits(tumor, 'try-error')) {
        data <- generateTumorData(tumor,  dataPars$snps.seq,  dataPars$snps.cgh,  dataPars$mu, dataPars$sigma.reads,
                        dataPars$sigma0.lrr, dataPars$sigma0.baf, dataPars$density.sigma)
      }
      if (!inherits(tumor, 'try-error') & !inherits(data, 'try-error')) {
        go <- TRUE
      }
    }
    filename.tumor <- paste(simPath, '/', 'sim-', j, '.rda', sep = '')
    filename.data <- paste(dataPath, '/', 'dat-', j, '.rda', sep = '')
    sim <- list('tumor' = tumor)
    save(sim, file = filename.tumor)
    dat <- list('dat' = data)
    save(dat, file = filename.data)
  }
  if (nu > 0 & pcnv > 0) {
    alt.types <- 'both'
  } else if (nu > 0 & pcnv == 0) {
    alt.types <- 'mut'
  } else if (nu == 0 & pcnv > 0) {
    alt.types <- 'cnv'
  } else {
    alt.types <- 'none'
  }
  metadata <- list(tumor.params = list(rounds, nu, pcnv, norm.contam), data.params = dataPars,
                   alt.types = alt.types, date = Sys.Date(), setID = setID)
  save(metadata, file = paste(simPath, '/metadata', '.rda', sep = ''))
}


###Now code to generate a set of 300 artificial mixtures:
mix <- function(datList, psi, dataPath, mixPath, index, pos, alter=FALSE) {
  breaks <- lapply(1:22, function(j) {
    brks <- sort(unique(unlist(sapply(1:length(datList), function(k) {c(datList[[k]][datList[[k]]$chrom==j,]$loc.start, 
                                                                       datList[[k]][datList[[k]]$chrom==j,]$loc.end)}))))
    starts <- brks[1:(length(brks) - 1)]
    ends <- c(starts[2:length(starts)] - 1, brks[length(brks)])
    data.frame(start = starts, end = ends, chr = rep(j, length(starts)))
  })
  coords <- Reduce(rbind, breaks)
  columns <- sample(1:2,length(psi),replace=TRUE)
  shifts <- sapply(1:length(datList),function(j) {
    lrrs <- unlist(sapply(1:nrow(datList[[j]]), function(k) {rep(datList[[j]]$seg.median[k], datList[[j]]$num.mark[k])}))
    dens <- density(na.omit(2*10^lrrs))
    dens$x[which.max(dens$y)] - 2
  })
  d2 <- lapply(1:length(datList), function(j) {
    temp <- datList[[j]]
    res <- t(sapply(1:nrow(coords),function(k) {
      chrdat <- temp[as.character(temp$chrom) == as.character(coords$chr[k]),]
      seg <- chrdat[chrdat$loc.end>=coords$start[k] & chrdat$loc.start<=coords$end[k],]
      baf <- seg$AvgBAF
      tcn <- 2*10^(seg$seg.median)
      tcn <- tcn - shifts[j]
      if (length(which(tcn<0))>0) {
        tcn[which(tcn<0)] <- -tcn[which(tcn<0)]
      }
      X <- tcn*(1 - baf)
      Y <- tcn*baf
      if (nrow(seg)>0) {
        output <- c(X, Y)
      } else {
        output <- rep(NA, 2)
      }
      output
    }))
    colnames(res) <- c('X', 'Y')
    res
  })
  data.mixed <- as.data.frame(Reduce('+', lapply(1:length(d2), function(k) {psi[k]*d2[[k]]})))
  notna <- which(sapply(1:nrow(data.mixed), function(x) {length(which(is.na(data.mixed[x,]))) == 0}))
  for (j in 1:length(d2)) {
    d2[[j]] <- d2[[j]][notna,]
  }
  data.mixed <- data.mixed[notna,]
  starts <- coords$start[notna]
  ends <- coords$end[notna]
  chrs <- coords$chr[notna]
  data.mixed$chr <- chrs
  data.mixed$seg <- 1:length(notna)
  data.mixed$LRR <- log10((data.mixed$X + data.mixed$Y)/2)
  data.mixed$BAF <- data.mixed$X/(data.mixed$X + data.mixed$Y)
  data.mixed$markers <- unlist(sapply(1:22,function(i) {
    pos.chr <- pos[pos$Chr==i,]
    data.chr <- data.mixed[data.mixed$chr==i,]
    sapply(1:nrow(data.chr),function(j) {
      length(which(pos.chr >= starts[which(data.mixed$chr==i)[j]] & pos.chr <= ends[which(data.mixed$chr==i)[j]]))
    })
  }))
  data.mixed$start <- starts
  data.mixed$end <- ends
  data.mixed <- data.mixed[,c(3,4,5,6,1,2,7,8,9)]
  clones <- lapply(1:length(d2),function(k) {
    df <- data.frame('A'=rep(1,nrow(d2[[1]])),'B'=rep(1,nrow(d2[[1]])))
    colnames(df) <- c('A','B')
    df$chr <- data.mixed$chr
    df$start <- starts
    df$end <- ends
    df$seg <- 1:length(starts)
    df$parent.index <- rep(0, length(starts))
    df$markers <- data.mixed$markers
    df <- df[,c(3,4,5,1,2,6,7,8)]
    list('cn'=df, 'seq'=NULL)
  })
  for (k in 2:nrow(clones[[1]]$cn)) {
    bin1 <- data.mixed$X[k]==data.mixed$X[k-1] & data.mixed$Y[k]==data.mixed$Y[k-1]
    bin2 <- clones[[1]]$cn$chr[k]==clones[[1]]$cn$chr[k-1]
    if (is.na(bin1)) {
      bin1 <- FALSE
    }
    if (bin1==TRUE & bin2==TRUE) {
      for (l in 1:length(clones)) {
        clones[[l]]$cn$start[k] <- clones[[l]]$cn$start[k-1]
        clones[[l]]$cn$markers[k] <- clones[[l]]$cn$markers[k] + clones[[l]]$cn$markers[k-1]
        clones[[l]]$cn[k-1,] <- rep(NA, ncol(clones[[1]]$cn))
      }
      data.mixed$start[k] <- data.mixed$start[k-1]
      data.mixed$markers[k] <- data.mixed$markers[k-1]
      data.mixed[k-1,] <- rep(NA, ncol(data.mixed))
    }
  }
  notna <- unique(unlist(lapply(1:length(clones),function(j) {which(!is.na(clones[[j]]$cn$seg))})))
  for (k in 1:length(clones)) {
    clones[[k]]$cn <- clones[[k]]$cn[notna,]
    clones[[k]]$cn$seg <- rownames(clones[[k]]$cn) <- 1:nrow(clones[[k]]$cn)
    clones[[k]]$cn
  }
  data.mixed <- data.mixed[notna,]
  dat <- list('cn.data'=data.mixed, 'seq.data'=NULL)
  dat$cn.data <- na.omit(dat$cn.data)
  dat$cn.data$seg <- rownames(dat$cn.data) <- 1:nrow(dat$cn.data)
  if (alter) {
    alt.pools <- lapply(1:length(psi[which(psi>0)]),function(j) {
      x <- dat$cn.data$X
      y <- dat$cn.data$Y
      markers <- dat$cn.data$markers
      which(markers>1000 & x-shifts[j]/2>.95 & x-shifts[j]/2<1.05 & y-shifts[j]/2>.95 & y-shifts[j]/2<1.05)
    })
    altN <- sample(1:3,length(psi[which(psi>0)]),prob=c(.6,.3,.1),replace=TRUE)
    altered <- lapply(1:length(psi[which(psi>0)]),function(j) {sample(alt.pools[[j]],altN[j])})
    change <- lapply(1:length(psi[which(psi>0)]),function(j) {sample(c(-1,1),altN[j],replace=TRUE)})
    for (l in 1:length(psi[which(psi>0)])) {
      if (length(altered[[l]])>0) {
        for (m in 1:length(altered[[l]])) {
          allele <- sample(c(1,2),1)
          clones[[l]]$cn[altered[[l]][[m]],allele+3] <- 
            clones[[l]]$cn[altered[[l]][[m]],allele+3] + change[[l]][[m]]
          dat$cn.data[altered[[l]][[m]],allele+4] <- dat$cn.data[altered[[l]][[m]],allele+4] + 
            change[[l]][[m]]*psi[l]
          if (dat$cn.data[altered[[l]][[m]],allele+4] < 0) {
            dat$cn.data[altered[[l]][[m]],allele+4] <- -dat$cn.data[altered[[l]][[m]],allele+4]
          }
        }
      }
    }
  }
  tumor <- list('clones'=clones, 'psi'=psi, 'altered'=altered,'change'=change)
  save(dat, file=paste(dataPath, '/mixdat', '-', index, '.rda', sep=''))
  save(tumor, file=paste(mixPath, '/mixsim', '-', index, '.rda', sep=''))
}

generateMixtures <- function(dataPath, mixPath, nPerK, segmentedData, ID_pool, pos) {
  threshold <- 0.04
  psis <- Reduce(rbind, lapply(1:5, function(x) {
    mat <- sampleSimplex(nPerK[x], x)
    cbind(mat, matrix(0, nrow = nrow(mat), ncol = 5 - ncol(mat)))
  }))
  IDs <- lapply(1:nrow(psis), function(j) {
    sample(ID_pool, length(which(psis[j,]>0)), replace=FALSE)
  })
  if (!dir.exists(mixPath)) {
    dir.create(mixPath)
  }
  if (!dir.exists(dataPath)) {
    dir.create(dataPath)
  }
  metadata <- list(IDs = IDs, psis = psis)
  save(metadata, file=paste(mixPath, '/metadata.rda', sep=''))
  sapply(1:nrow(psis), function(i) {
    temp <- lapply(1:length(IDs[[i]]), function(j) {
      segmentedData[segmentedData$SamID == IDs[[i]][j],]
    })
    mix(temp , psis[i,], dataPath, mixPath, i, pos, alter=TRUE)
  })
}