########################################################################
### Part 1: Simplices
###
### The d-dimensional simplex is the set of nonnegative points in R^d
### that satisfy x_1 + .. + x_d = 1. These points represent partitions
### of data into subsets, such as taking the fraction of cells that are
### part of each clone.
### Sample n points from the d-simplex
sampleSimplex <- function(n, d = 5) {
result <- matrix(NA, nrow=n, ncol = d)
for (i in 1:n) {
result[i,] <- diff(sort( c(0, 1, runif(d - 1, 0, 1)) ))
}
result
}
### Generate a regularly spaced lattice of points in the d-simplex,
### sapnned by k points along each edge.
### Use symmetry to reduce the number we need to try.
generateSimplex <- function(k, d, reps=1){
simplex <- t(xsimplex(d, k))
for(i in 1:nrow(simplex)){
simplex[i,] <- sort(simplex[i,], decreasing=TRUE)
}
simplex <- unique(simplex)
psis <- t(sapply(rep(1:nrow(simplex), reps), function(i){simplex[i,]/k}))
psis
}
########################################################################
### Part 2: WeightVectors
###
### We also refer to points in a simplex as "weight vectors" (since
### "vector of clonal fractions" is both too long and too specialized).
### We define these as an S4 class so we can build validitiy checking
### into the constructor, and so we can more reliably use them when we
### generate "Tumor" objects later.
setClass("WeightVector", slots=c(psi = "numeric"))
setValidity("WeightVector", function(object) {
all( object@psi >= 0 ) & sum(object@psi) == 1
})
setMethod("initialize", "WeightVector", function(.Object, psi = 1, ...) {
.Object <- callNextMethod(.Object) # in case this gets inherited
if (any(is.na(psi))) {
stop("Psi should not contain missing values.")
}
if (any(psi < 0)) {
stop("Psi should not contain negative values.")
}
if (all(psi == 0)) {
stop("At least one psi component must be larger than zero.")
}
.Object@psi = psi/sum(psi)
.Object
})
WeightVector <- function(phi) {
new("WeightVector", psi = phi)
}
setAs("WeightVector", "numeric", function(from) from@psi)
########################################################################
# Part 3: Simulating Data
############ CLONE ############
# A clone is a list of segments, posibly with abnormalities.
setClass("Clone", representation=list(
segments = "integer",
weights = "numeric"
))
# We should be able to simulate a clone if we know the number of segments,
# the relative frequency/prevalence of each pure compartment.
Clone <- function(nSegments, weights=rep(1/5, 5), segnames=NULL) {
# nSegments = integer, the number of segments
# weights = vector, the prevalence of each compartment
if (nSegments < 1) stop("Number of segments must be positive.")
# start with sanity checks on the weights
weights <- as(WeightVector(weights), "numeric")
# now sample the compartments to generate a clone
segs <- sample(length(weights), nSegments, replace=TRUE, prob=weights)
if (is.null(segnames)) segnames <- paste("Segment", 1:nSegments, sep='')
if (length(segnames) != length(segs)) stop("Wrong number of segment names.")
names(segs) <- segnames
new("Clone", segments=segs, weights=weights)
}
############ TUMOR ############
# A tumor is a set of clones, each of which is associated with a
# fraction, subject to the constraint that the sum of the fractions
# equals one.
setClass("Tumor",
slots = c(psi = "WeightVector",
clones = "list",
tree = "list"))
setAs("Tumor", "list", function(from) {
list(psi = as(from@psi, "numeric"),
clones = from@clones,
tree = from@tree)
})
setAs("list", "Tumor", function(from) {
new("Tumor",
psi = WeightVector(from$psi),
clones = from$clones,
tree = from@tree)
})
setMethod("summary", signature("Tumor"), function(object, ...) {
tdata <- object@data
table(A=tdata[,1], B=tdata[,2])
})
getClone <- function(tumor, i) {
if (inherits(tumor, "Tumor")) {
tumor <- as(tumor, "list")
}
tumor$clones[[i]] # should we do error checking on 'i'?
}
### TODO: Wrap everything except psi into some sort of parameter class.
Tumor <- function(psi, rounds, nu=100, pcnv=0.5, norm.contam=FALSE, cnmax=4, mutationProbs=NULL, genes=NULL, labelSize=1) {
if(!exists('chlens')){
data('chlens')
}
K <- length(which(psi > 0)) # number of clones
## Choose the number of copy number (CN) segments.
total.segs <- round(runif(1, 250, 500))
## Distribute segments across the chromosomes.
segsperchr <- as.vector(rdirichlet(1, chlens/1000000)) # per megabase?
segsperchr <- round(segsperchr*total.segs)
segsperchr[segsperchr < 1] <- 1
## Start building a fake DNA copy data frame. Each segment
## needs a chromosome with start and end positions.
chr <- unlist(lapply(1:24, function(i) {
rep(i, segsperchr[i])
}))
ends <- lapply(1:24, function(i) {
lens <- c(as.vector(rdirichlet(1, rep(1, segsperchr[i]))))
sapply(1:length(lens), function(j) {sum(lens[1:j])})
})
ends <- lapply(1:length(ends), function(i) {
round(chlens[chr[i]]*ends[[i]])
})
starts <- lapply(1:length(ends), function(i) {
c(1, ends[[i]][1:(length(ends[[i]])-1)]+1)
})
ends <- unlist(ends)
starts <- unlist(starts)
## Make the data frame.
cnmat <- cbind(chr, unlist(starts), unlist(ends), rep(1, length(starts)),
rep(1, length(starts)), 1:length(starts), rep(NA, length(starts)))
colnames(cnmat) <- c('chr', 'start', 'end', 'A', 'B', 'seg', 'parent.index')
#If there are specified mutations:
if(!is.null(mutationProbs)){
mutation.probs <- mutationProbs@mutation.probs
mutset <- rownames(mutation.probs)
L <- lapply(2:length(mutset),function(j){as.numeric(strsplit(mutset[j],split='-')[[1]])})
mut.chrs <- sapply(1:length(L),function(j){L[[j]][1]})
mut.loci <- sapply(1:length(L),function(j){L[[j]][2]})
}
## Evolve some clones, starting with the currently normal one.
newclone.seq <- matrix(numeric(0),ncol=7,nrow=0)
colnames(newclone.seq) <- c('chr', 'start', 'seg', 'mut.id', 'gene', 'mutated.copies', 'allele')
startclone <- list('cn'=as.data.frame(cnmat), 'seq'=as.data.frame(newclone.seq))
id.end <- 0
parent.indices <- c()
parentPool <- 1
clones <- list(startclone)
while(length(clones) <= rounds) {
parent.index <- sample(parentPool, 1)
parent.indices[length(clones)] <- parent.index
parent <- clones[[parent.index]]
if(nu > 0 | !is.null(mutationProbs)) { # Add some mutations. Possibly CNV as well
cnv <- sample(c(TRUE, FALSE), 1, prob = c(pcnv, 1 - pcnv))
} else { # Must be a CNV
cnv <- TRUE
}
newclone.cn <- parent$cn
newclone.cn[, which(colnames(newclone.cn)=='parent.index')] <- parent.index
newclone.seq <- parent$seq
## handle copy number variants
if(cnv) {
allele <- sample(c('A', 'B'), 1)
pool <- which(parent$cn[, which(colnames(cnmat)==allele)] > 0 &
parent$cn[, which(colnames(cnmat)==allele)]<cnmax)
tochange <- sample(pool, 1)
other.clones <- sapply(1:length(clones), function(j) {
clones[[j]]$cn[tochange, which(colnames(cnmat)==allele)]
})
if(max(other.clones) > 1) {
delta <- 1
} else if(min(other.clones) < 1) {
delta <- -1
} else {
delta <- sample(c(-1, 1), 1)
}
if(parent.index>1 & length(which(!is.na(unlist(newclone.seq)))) > 0) {
muts.tochange <- intersect(which(newclone.seq[, i.seg] == tochange),
which(newclone.seq[, i.allele] == allele))
if(length(muts.tochange) > 0) {
newclone.seq[muts.tochange, i.cn] <-
as.numeric(newclone.seq[muts.tochange, i.cn]) + delta
}
}
newclone.cn[tochange, which(colnames(cnmat)==allele)] <-
as.numeric(newclone.cn[tochange, which(colnames(cnmat)==allele)]) + delta
} #END: if(cnv)
##specified mutations:
if(!is.null(mutationProbs)){
preclude <- which((mut.loci %in% as.numeric(as.character(parent$seq[,2]))) &
(mut.chrs %in% as.numeric(as.character(parent$seq[,1]))))
if(length(which(parent.indices==parent.index))>1){
priorChildren <- which(parent.indices==parent.index)[1:(length(which(parent.indices==parent.index))-1)] + 1
priorMuts <- unique(unlist(lapply(priorChildren,function(x){
which((mut.loci %in% as.numeric(as.character(clones[[x]]$seq[,2]))) &
(mut.chrs %in% as.numeric(as.character(clones[[x]]$seq[,1]))))
})))
preclude <- sort(unique(c(preclude, priorMuts)))
}
if(length(preclude)>0){
pool <- (1:length(mutset))[-preclude]
}else{
pool <- 1:length(mutset)
}
probs <- mutation.probs[1,]
probs[preclude] <- 0
zeroOut <- unlist(sapply(1:length(preclude),function(j){unique(as.vector(which(mutation.probs[j+1,]==0)))}))
if(length(zeroOut)>0){
probs[zeroOut] <- 0
}
if(length(which(probs>0))>0){
newMut <- sample(1:length(mut.loci),1,prob = probs)
}else{
newMut <- numeric(0)
}
newMutLocus <- mut.loci[newMut]
newMutChr <- mut.chrs[newMut]
}else{
mut.loci <- newMutLocus <- mut.chrs <- newMutChr <- numeric(0)
}
##unspecified mutations
nmuts <- round(runif(1, nu/2, 1.5*nu))
if(nmuts > 0 | !is.null(mutationProbs)){
mutsperchr <- as.vector(rmultinom(1, nmuts, chlens/1000000))
starts <- lapply(1:length(mutsperchr), function(i) {
unique(round(sort(runif(mutsperchr[i], 1, chlens[i]))))
})
starts <- unlist(starts)
mut.chrs.new <- unlist(lapply(1:length(mutsperchr), function(i) {
rep(i, mutsperchr[i])
}))
remove <- which(starts %in% mut.loci)
if(length(remove) > 0){
starts <- starts[-remove]
mut.chrs.new <- mut.chrs[-remove]
}
starts <- c(starts,newMutLocus)
mut.chrs.new <- c(mut.chrs.new,newMutChr)
mut.segs <- unlist(sapply(1:length(starts), function(i) {
col.index <- which(colnames(cnmat)=='chr')
tab <- cnmat[which(cnmat[, col.index]==mut.chrs.new[i]), ]
bin1 <- which(tab[, which(colnames(tab)=='start')]<=starts[[i]])
index <- bin1[which(tab[bin1, which(colnames(tab)=='end')]>=starts[[i]])]
seg <- unname(tab[index, which(colnames(tab)=='seg')])
seg
}))
alleles <- sample(c('A', 'B'), length(starts), replace=TRUE)
mut.ids <- (id.end+1):(id.end+length(starts))
genenames <- rep(NA,length(starts))
if(!is.null(genes)){
genenames <- sapply(1:length(starts),function(z){
gn <- genes$name[which(genes$chr==mut.chrs.new[z] & genes$start<=starts[z] & genes$end>=starts[z])]
if(length(gn)==0){
gn <- NA
}
gn
})
}
if(!is.null(mutationProbs)){
if(!is.null(mutationProbs@genenames)){
genenames[which(starts %in% mut.loci & mut.chrs.new %in% mut.chrs)] <-
mutationProbs@genenames[which(mut.loci %in% starts & mut.chrs %in% mut.chrs.new)]
}
}
seqmat <- cbind(mut.chrs.new, starts, mut.segs, mut.ids, genenames, rep(1, length(starts)), alleles)
id.end <- max(mut.ids)
colnames(seqmat) <- c('chr', 'start', 'seg', 'mut.id', 'gene', 'mutated.copies', 'allele')
newclone.seq <- rbind(newclone.seq, seqmat)
rownames(newclone.seq) <- NULL
} else {
newclone.seq <- matrix(numeric(0),ncol=7,nrow=0)
colnames(newclone.seq) <- c('chr', 'start', 'seg', 'mut.id', 'gene', 'mutated.copies', 'allele') # No new mutations
} #END: if(nmuts > 0)
## Make the new clone, remembering what changed.
if (length(clones) == 1 & nmuts > 0) {
i.seg <- which(colnames(seqmat) == 'seg')
i.cn <- which(colnames(seqmat) == 'mutated.copies')
i.allele <- which(colnames(seqmat) == 'allele')
}
newclone <- list('cn'=as.data.frame(newclone.cn), 'seq'=as.data.frame(newclone.seq))
clones <- c(clones, list(newclone))
if(!is.null(mutationProbs)){
parentPool <- which(sapply(1:length(clones),function(x){
thisCloneMuts <- clones[[x]]$seq$start
thisCloneMuts <- thisCloneMuts[which(thisCloneMuts %in% mut.loci & clones[[x]]$seq$chr %in% mut.chrs)]
children <- which(parent.indices==x)
childMuts <- unique(unlist(lapply(children,function(y){
unique(clones[[y]]$seq$start)
})))
length(which(thisCloneMuts %in% childMuts)) < length(thisCloneMuts)
}))
}else{
parentPool <- 1:(length(clones)+1)
}
} #END: while(length(clones) <= rounds)
## This is still safe when K == 1.
if(norm.contam == TRUE) { # first "clone" consists of the normal cells.
sampled <- c(1, sample(2:length(clones), K-1, replace=FALSE))
}else{
sampled <- sample(2:length(clones), K, replace=FALSE)
}
clones.final <- lapply(1:length(sampled), function(I) {
cndf <- as.data.frame(clones[[sampled[I]]]$cn)
if(length(which(!is.na(unlist(clones[[sampled[I]]]$seq))))>0){
seqdf <- as.data.frame(clones[[sampled[I]]]$seq)
for(J in which(colnames(seqdf)!='allele')) {
rownames(seqdf) <- NULL
}
seqdf$normal.copies <- na.omit(unlist(lapply(1:nrow(cndf), function(J) {
total.cn <- cndf$A[J] + cndf$B[J]
if(length(which(seqdf$seg == J)) > 0){
normal.cns <- total.cn - as.numeric(as.character(seqdf$mutated.copies[which(seqdf$seg == J)]))
} else {
normal.cns <- NA
}
normal.cns
})))
} else {
seqdf <- NA
}
if(nu > 0 | !is.null(mutationProbs)){
output <- list('cn'=cndf, 'seq'=seqdf)
}else{
output <- list('cn'=cndf)
}
output
})
#First clone in set of clones is normal cells, which we'll index as 0:
parent.indices <- parent.indices
parent.indices <- c(NA, parent.indices)
distinguishingGenes <- lapply(1:length(clones),function(j){
if(j==1){
genes <- c()
}else{
thisCloneGenes <- as.character(na.omit(unique(clones[[j]]$seq$gene)))
parentCloneGenes <- as.character(na.omit(clones[[parent.indices[j]]]$seq$gene))
genes <- thisCloneGenes[which(!thisCloneGenes %in% parentCloneGenes)]
genes
}
genes
})
if(length(clones.final)==1){
cloneTree <- NA
}else{
tree.df <- as.data.frame(t(sapply(2:length(clones),function(x){
asNum <- c(parent.indices[x],x) - 1
if(length(distinguishingGenes[[x]])==1){
gene.child <- distinguishingGenes[[x]]
childname <- paste('Clone ',asNum[2],' (',gene.child,')',sep='')
}else{
childname <- paste('Clone ',asNum[2],sep='')
}
if(length(distinguishingGenes[[parent.indices[x]]])==1){
gene.parent <- distinguishingGenes[[parent.indices[x]]]
parentname <- paste('Clone ',asNum[1],' (',gene.parent,')',sep='')
}else{
parentname <- paste('Clone ',asNum[1],sep='')
}
if(asNum[1]==0){
parentname <- 'Normal'
}
c('Parent'=parentname,'Child'=childname)
})))
nodes <- as.character(unlist(tree.df))
nodeNumber <- as.numeric(sapply(nodes,function(node){strsplit(strsplit(node,split="Clone ")[[1]][2],split="[ (]")[[1]][1]})) + 1
nodeNumber[is.na(nodeNumber)] <- 1
nodeNumber <- unique(nodeNumber)
colors <- rep('gray',length(unique(unlist(tree.df))))
extant <- which(nodeNumber %in% sampled)
colors[extant] <- 'red'
tree.obj <- graph.data.frame(tree.df)
V(tree.obj)$color <- colors
V(tree.obj)$arrow.mode <- 0
V(tree.obj)$label.cex <- labelSize
V(tree.obj)$label.degree <- 1.2
V(tree.obj)$label.dist <- 2
cloneTree <- list('tree.obj'=tree.obj,'tree.df'=tree.df)
}
new("Tumor", clones = clones.final, psi = WeightVector(psi), tree = cloneTree)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.