c3co: Cancer Cell Clonality using Copy-Number

Documented in buildSubclones

#' Build (pure) subclones by resampling real copy-number data
#'
#' @param len The number of loci in each subclone.
#'
#' @param nbClones The number of subclones.
#'
#' @param bkps A list of vectors of breakpoint positions for each subclone.
#'
#' @param regions A list of altered regions for each subclone.
#'
#' @param dataAnnotTP A data frame containing tumor data from which the subclone
#'   data are sampled, with columns: \describe{ \item{`c`}{total copy number}
#'   \item{`b`}{B allele fraction} \item{`region`}{type of DNA copy number
#'   alteration} \item{`genotype`}{germline genotype} } If `NULL` (default),
#'   this data frame is simulated using a simple model with Gaussian errors (on
#'   the scale of log-copy-numbers).
#'
#' @param dataAnnotN A data frame containing paired normal data from which the
#'   subclone data are sampled. Same format as argument `dataAnnotTP`. Must not
#'   be `NULL` if argument `dataAnnotTP` is not `NULL`.
#'
#' @param eps A numeric value, the signal to noise ratio for simulated data.
#'
#' @return A list of locus-level data sets for each subclone. Each data set is
#'   of the format described above for argument \code{dataAnnotTP}.
#'
#' @details The subclones are designed to mimic *intra*-tumor heterogeneity. In
#'   particular, all of the subclones generated by a single call to this
#'   function share the same germline genotypes. Their relative frequency
#'   corresponds to that of the input data.
#'
#'   If \code{dataAnnotTP} is provided, then the output data are simply sampled
#'   from it. Else, they are simulated as follows: (i) generate genotypes and
#'   minor and major copy number profiles, (ii) transform them back into more
#'   classical copy number estimates: total copy number and allele B fractions,
#'   ie the format described above for argument \code{dataAnnotTP}.
#'
#' @examples
#' len <- 500*10  ## Number of loci
#' K <- 2L        ## Number of subclones
#' bkps <- list(c(100, 250)*10, c(150, 400)*10)
#' regions <- list(c("(0,1)", "(0,2)", "(1,2)"), c("(1,1)", "(0,1)", "(1,1)"))
#'
#' ## Resampling read copy number data
#' dataAnnotTP <- acnr::loadCnRegionData(dataSet="GSE13372", tumorFraction=1)
#' dataAnnotN <- acnr::loadCnRegionData(dataSet="GSE13372", tumorFraction=0)
#'
#' datR <- buildSubclones(len=len, nbClones=K, bkps=bkps, regions=regions,
#'                        dataAnnotTP=dataAnnotTP, dataAnnotN=dataAnnotN)
#'
#' ## Simulated data
#' datS <- buildSubclones(len=len, nbClones=K, bkps=bkps, regions=regions)
#'
#' @importFrom jointseg getCopyNumberDataByResampling
#' @importFrom stats rnorm
#' @export
buildSubclones <- function(len, nbClones, bkps, regions, dataAnnotTP=NULL, dataAnnotN=NULL, eps=0.25) {
    if (nbClones != length(regions)) {
        stop("Argument 'nbClones' should match 'length(regions)'")
    }
    if (nbClones != length(bkps)) {
        stop("Argument 'nbClones' should match 'length(bkp)'")
    }
    if (is.factor(dataAnnotTP$region)) {
        dataAnnotTP$region <- as.character(dataAnnotTP$region)
    }
    genotype <- NULL; rm(list = "genotype")
    if (is.null(dataAnnotN)) {
        if (!is.null(dataAnnotTP)) {
            stop("Argument 'dataAnnotN' must be provided if argument 'dataAnnotTP' is")
        }
        ## assume equal repartition of AA, AB, BB for simplicity
        pct <- rep(1, times = 3L)/3
    } else {
        if (is.factor(dataAnnotTP$genotype)) {
            dataAnnotTP$genotype <- as.character(dataAnnotTP$genotype)
        }
        tbl <- table(dataAnnotN$genotype)
        pct <- tbl/sum(tbl)
    }
    genos <- sample(c(0, 0.5, 1), size=len, prob=pct, replace=TRUE)

    idxAA <- which(genos == 0)
    idxAB <- which(genos == 0.5)
    idxBB <- which(genos == 1)
    pos <- c(idxAB, idxAA, idxBB)

    if (is.null(dataAnnotTP)) {
        if (!is.null(dataAnnotN)) {
            warning("Argument 'dataAnnotN' will not be considered as 'dataAnnotTP' is NULL")
        }
        ## generate data from a simple Gaussian model based on region labels,
        ## assuming regions are coded as "(C1, C2)", as in the acnr package
        regs <- unique(unlist(regions))
        pattern <- "\\(([0-9]+),([0-9]+)\\)"
        C1 <- as.numeric(gsub(pattern, "\\1", regs))
        C2 <- as.numeric(gsub(pattern, "\\2", regs))
        if (anyNA(C1) || anyNA(C2)) {
            stop("Argument 'region' should be of the form '(C1, C2)', where C1 and C2 denote minor and major copy numbers")
        }

        n <- 5e3     ## number of data points to be sampled from

        CN1 <- rep(1, times = length(C))  ## for hets in the matched normal
        CN0 <- rep(0, times = length(C))  ## for homs in the matched normal
        CN2 <- rep(2, times = length(C))  ## for homs in the matched normal
        C0 <- rep(0, times = length(C))   ## for Homs
        C <- C1+C2                        ## for Homs
        
        ## Hets
        dat <- getDat(C1, C2, n, eps) 
        regz <- dat$region
        datN <- getDat(CN1, CN1, n, eps)  ## "matched normal"
        datAB <- data.frame(ct=dat$c, baft=dat$b, genotype=1/2, region=regz,
                       cn=datN$c, bafn=datN$b, stringsAsFactors=FALSE)
        rm(list = c("dat", "datN"))
        
        ## symmetrization
        nr <- nrow(datAB)
        ww <- sample(nr, round(nr/2))
        datAB[ww, "baft"] <- 1-datAB[ww, "baft"]
        datAB[ww, "bafn"] <- 1-datAB[ww, "bafn"]

        ## Homs AA:  CA=C1+C2, CB=0
        dat <- getDat(C, C0, n, eps) 
        datN <- getDat(CN2, CN0, n, eps)  ## "matched normal"
        datAA <- data.frame(ct=dat$c, baft=dat$b, genotype=0, region=regz,
                       cn=datN$c, bafn=datN$b, stringsAsFactors=FALSE)
        rm(list = c("dat", "datN"))
        
        ## Homs BB:  CA=0, CB=C1+C2
        dat <- getDat(C0, C, n, eps) 
        datN <- getDat(CN0, CN2, n, eps)  ## "matched normal"
        datBB <- data.frame(ct=dat$c, baft=dat$b, genotype=1, region=regz,
                       cn=datN$c, bafn=datN$b, stringsAsFactors=FALSE)
        rm(list = c("dat", "datN"))
    } else {
        keepCols <- c("c", "b", "genotype", "region")
        dataAnnot <- data.frame(dataAnnotTP[, keepCols],
                                dataAnnotN[, c("c", "b")])
        colnames(dataAnnot) <- c("ct", "baft", "genotype", "region", "cn", "bafn")
        
        ## split by genotype to generate several subclones with the same genotypes!
        datAA <- subset(dataAnnot, genotype == 0)
        datAB <- subset(dataAnnot, genotype == 0.5)
        datBB <- subset(dataAnnot, genotype == 1)
    }
    
    subClone <- lapply(seq_len(nbClones), FUN=function(ii) {
        bkp <- bkps[[ii]]
        reg <- regions[[ii]]
        

        bkpsAB <- sapply(bkp, FUN=function(bb) max(which(idxAB <= bb)))
        sim <- getCopyNumberDataByResampling(length=length(idxAB),
                                             regData=datAB, bkp=bkpsAB,
                                             regions=reg)
        ssAB <- sim$profile

        bkpsAA <- sapply(bkp, FUN=function(bb) max(which(idxAA <= bb)))
        sim <- getCopyNumberDataByResampling(length=length(idxAA),
                                             regData=datAA, bkp=bkpsAA,
                                             regions=reg)
        ssAA <- sim$profile

        bkpsBB <- sapply(bkp, FUN=function(bb) max(which(idxBB <= bb)))
        sim <- getCopyNumberDataByResampling(length=length(idxBB),
                                             regData=datBB, bkp=bkpsBB,
                                             regions=reg)
        ssBB <- sim$profile

        ss <- rbind(ssAB, ssAA, ssBB)
        ss$pos <- pos
        o <- order(pos)
        ss[o, , drop=FALSE]
    })
    names(subClone) <- seq_along(subClone)
    subClone
}

## local function to generate toy data on the C1,C2 scale
getDat <- function(C1, C2, n, eps) {
    regs <- sprintf("(%s,%s)", pmin(C1, C2), pmax(C1, C2))  ## back to TRUE C1,C2 (if required)
    regz <- rep(regs, each=n)
    mean1 <- rep(C1, each=n)
    mean2 <- rep(C2, each=n)
    
    ntot <- n*length(regs)
    c1 <- exp(rnorm(ntot, mean=log(1 + mean1), sd=eps) - 1)  ## exp(log(1+.)-1) to enforce positivity
    c2 <- exp(rnorm(ntot, mean=log(1 + mean2), sd=eps) - 1)  ## exp(log(1+.)-1) to enforce positivity
    c <- c1 + c2
    d <- abs(c2 - c1)/c  ## absolute value implicitly enforces c1 < c2
    b <- 1/2 + (c2 - c1)/c/2
    dat <- data.frame(c=c, b=b, region=regz, stringsAsFactors=FALSE)
}