#' Build (pure) subclones by resampling real copy-number data
#'
#' @param len The number of loci in each subclone.
#'
#' @param nbClones The number of subclones.
#'
#' @param bkps A list of vectors of breakpoint positions for each subclone.
#'
#' @param regions A list of altered regions for each subclone.
#'
#' @param dataAnnotTP A data frame containing tumor data from which the subclone
#' data are sampled, with columns: \describe{ \item{`c`}{total copy number}
#' \item{`b`}{B allele fraction} \item{`region`}{type of DNA copy number
#' alteration} \item{`genotype`}{germline genotype} } If `NULL` (default),
#' this data frame is simulated using a simple model with Gaussian errors (on
#' the scale of log-copy-numbers).
#'
#' @param dataAnnotN A data frame containing paired normal data from which the
#' subclone data are sampled. Same format as argument `dataAnnotTP`. Must not
#' be `NULL` if argument `dataAnnotTP` is not `NULL`.
#'
#' @param eps A numeric value, the signal to noise ratio for simulated data.
#'
#' @return A list of locus-level data sets for each subclone. Each data set is
#' of the format described above for argument \code{dataAnnotTP}.
#'
#' @details The subclones are designed to mimic *intra*-tumor heterogeneity. In
#' particular, all of the subclones generated by a single call to this
#' function share the same germline genotypes. Their relative frequency
#' corresponds to that of the input data.
#'
#' If \code{dataAnnotTP} is provided, then the output data are simply sampled
#' from it. Else, they are simulated as follows: (i) generate genotypes and
#' minor and major copy number profiles, (ii) transform them back into more
#' classical copy number estimates: total copy number and allele B fractions,
#' ie the format described above for argument \code{dataAnnotTP}.
#'
#' @examples
#' len <- 500*10 ## Number of loci
#' K <- 2L ## Number of subclones
#' bkps <- list(c(100, 250)*10, c(150, 400)*10)
#' regions <- list(c("(0,1)", "(0,2)", "(1,2)"), c("(1,1)", "(0,1)", "(1,1)"))
#'
#' ## Resampling read copy number data
#' dataAnnotTP <- acnr::loadCnRegionData(dataSet="GSE13372", tumorFraction=1)
#' dataAnnotN <- acnr::loadCnRegionData(dataSet="GSE13372", tumorFraction=0)
#'
#' datR <- buildSubclones(len=len, nbClones=K, bkps=bkps, regions=regions,
#' dataAnnotTP=dataAnnotTP, dataAnnotN=dataAnnotN)
#'
#' ## Simulated data
#' datS <- buildSubclones(len=len, nbClones=K, bkps=bkps, regions=regions)
#'
#' @importFrom jointseg getCopyNumberDataByResampling
#' @importFrom stats rnorm
#' @export
buildSubclones <- function(len, nbClones, bkps, regions, dataAnnotTP=NULL, dataAnnotN=NULL, eps=0.25) {
if (nbClones != length(regions)) {
stop("Argument 'nbClones' should match 'length(regions)'")
}
if (nbClones != length(bkps)) {
stop("Argument 'nbClones' should match 'length(bkp)'")
}
if (is.factor(dataAnnotTP$region)) {
dataAnnotTP$region <- as.character(dataAnnotTP$region)
}
genotype <- NULL; rm(list = "genotype")
if (is.null(dataAnnotN)) {
if (!is.null(dataAnnotTP)) {
stop("Argument 'dataAnnotN' must be provided if argument 'dataAnnotTP' is")
}
## assume equal repartition of AA, AB, BB for simplicity
pct <- rep(1, times = 3L)/3
} else {
if (is.factor(dataAnnotTP$genotype)) {
dataAnnotTP$genotype <- as.character(dataAnnotTP$genotype)
}
tbl <- table(dataAnnotN$genotype)
pct <- tbl/sum(tbl)
}
genos <- sample(c(0, 0.5, 1), size=len, prob=pct, replace=TRUE)
idxAA <- which(genos == 0)
idxAB <- which(genos == 0.5)
idxBB <- which(genos == 1)
pos <- c(idxAB, idxAA, idxBB)
if (is.null(dataAnnotTP)) {
if (!is.null(dataAnnotN)) {
warning("Argument 'dataAnnotN' will not be considered as 'dataAnnotTP' is NULL")
}
## generate data from a simple Gaussian model based on region labels,
## assuming regions are coded as "(C1, C2)", as in the acnr package
regs <- unique(unlist(regions))
pattern <- "\\(([0-9]+),([0-9]+)\\)"
C1 <- as.numeric(gsub(pattern, "\\1", regs))
C2 <- as.numeric(gsub(pattern, "\\2", regs))
if (anyNA(C1) || anyNA(C2)) {
stop("Argument 'region' should be of the form '(C1, C2)', where C1 and C2 denote minor and major copy numbers")
}
n <- 5e3 ## number of data points to be sampled from
CN1 <- rep(1, times = length(C)) ## for hets in the matched normal
CN0 <- rep(0, times = length(C)) ## for homs in the matched normal
CN2 <- rep(2, times = length(C)) ## for homs in the matched normal
C0 <- rep(0, times = length(C)) ## for Homs
C <- C1+C2 ## for Homs
## Hets
dat <- getDat(C1, C2, n, eps)
regz <- dat$region
datN <- getDat(CN1, CN1, n, eps) ## "matched normal"
datAB <- data.frame(ct=dat$c, baft=dat$b, genotype=1/2, region=regz,
cn=datN$c, bafn=datN$b, stringsAsFactors=FALSE)
rm(list = c("dat", "datN"))
## symmetrization
nr <- nrow(datAB)
ww <- sample(nr, round(nr/2))
datAB[ww, "baft"] <- 1-datAB[ww, "baft"]
datAB[ww, "bafn"] <- 1-datAB[ww, "bafn"]
## Homs AA: CA=C1+C2, CB=0
dat <- getDat(C, C0, n, eps)
datN <- getDat(CN2, CN0, n, eps) ## "matched normal"
datAA <- data.frame(ct=dat$c, baft=dat$b, genotype=0, region=regz,
cn=datN$c, bafn=datN$b, stringsAsFactors=FALSE)
rm(list = c("dat", "datN"))
## Homs BB: CA=0, CB=C1+C2
dat <- getDat(C0, C, n, eps)
datN <- getDat(CN0, CN2, n, eps) ## "matched normal"
datBB <- data.frame(ct=dat$c, baft=dat$b, genotype=1, region=regz,
cn=datN$c, bafn=datN$b, stringsAsFactors=FALSE)
rm(list = c("dat", "datN"))
} else {
keepCols <- c("c", "b", "genotype", "region")
dataAnnot <- data.frame(dataAnnotTP[, keepCols],
dataAnnotN[, c("c", "b")])
colnames(dataAnnot) <- c("ct", "baft", "genotype", "region", "cn", "bafn")
## split by genotype to generate several subclones with the same genotypes!
datAA <- subset(dataAnnot, genotype == 0)
datAB <- subset(dataAnnot, genotype == 0.5)
datBB <- subset(dataAnnot, genotype == 1)
}
subClone <- lapply(seq_len(nbClones), FUN=function(ii) {
bkp <- bkps[[ii]]
reg <- regions[[ii]]
bkpsAB <- sapply(bkp, FUN=function(bb) max(which(idxAB <= bb)))
sim <- getCopyNumberDataByResampling(length=length(idxAB),
regData=datAB, bkp=bkpsAB,
regions=reg)
ssAB <- sim$profile
bkpsAA <- sapply(bkp, FUN=function(bb) max(which(idxAA <= bb)))
sim <- getCopyNumberDataByResampling(length=length(idxAA),
regData=datAA, bkp=bkpsAA,
regions=reg)
ssAA <- sim$profile
bkpsBB <- sapply(bkp, FUN=function(bb) max(which(idxBB <= bb)))
sim <- getCopyNumberDataByResampling(length=length(idxBB),
regData=datBB, bkp=bkpsBB,
regions=reg)
ssBB <- sim$profile
ss <- rbind(ssAB, ssAA, ssBB)
ss$pos <- pos
o <- order(pos)
ss[o, , drop=FALSE]
})
names(subClone) <- seq_along(subClone)
subClone
}
## local function to generate toy data on the C1,C2 scale
getDat <- function(C1, C2, n, eps) {
regs <- sprintf("(%s,%s)", pmin(C1, C2), pmax(C1, C2)) ## back to TRUE C1,C2 (if required)
regz <- rep(regs, each=n)
mean1 <- rep(C1, each=n)
mean2 <- rep(C2, each=n)
ntot <- n*length(regs)
c1 <- exp(rnorm(ntot, mean=log(1 + mean1), sd=eps) - 1) ## exp(log(1+.)-1) to enforce positivity
c2 <- exp(rnorm(ntot, mean=log(1 + mean2), sd=eps) - 1) ## exp(log(1+.)-1) to enforce positivity
c <- c1 + c2
d <- abs(c2 - c1)/c ## absolute value implicitly enforces c1 < c2
b <- 1/2 + (c2 - c1)/c/2
dat <- data.frame(c=c, b=b, region=regz, stringsAsFactors=FALSE)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.