c3co: Cancer Cell Clonality using Copy-Number

Documented in mixSubclones

#' Create a mixture of subclones
#'
#' @param subClones A list of K subclones, as provided by [buildSubclones()].
#'   Each subclone is a data.frame with the following columns: \describe{
#'   
#' \item{ct}{total copy number in the tumor}
#' \item{baft}{B allele fraction in the tumor}
#' \item{genotype}{germline genotype in 0, 0.5, 1}
#' \item{cn}{total copy number in the matched normal}
#' \item{bafn}{B allele fraction in the matched normal}}
#'
#' @param W A n x K matrix of weights in \eqn{[0,1]}, as generated by
#'   [rSparseWeightMatrix()]. 
#'
#' @return A list of n samples obtained by mixing the pure subclones according
#'   to the proportions specified in argument W.
#'
#' @details The mixture is performed at the scale of minor and major copy
#'   numbers (which is the scale at which copy number events occur). Therefore
#'   the data is first mapped to this scale, then mixed, then mapped back to the
#'   original (tcn, dh) scale. 
#'   
#'   By construction, dh, minor and major CN are only defined for heterozygous
#'   SNPs. Therefore, in the output data, the corresponding columns are missing
#'   for homozygous SNPs. Total copy numbers are *not* missing, so it makes
#'   sense to keep the homozygous SNPs in the output data as they will be
#'   exploited at the segmentation step.
#'
#' @examples
#' dataAnnotTP <- acnr::loadCnRegionData(dataSet="GSE11976", tumorFrac=1)
#' dataAnnotN <- acnr::loadCnRegionData(dataSet="GSE11976", tumorFrac=0)
#' len <- 500*10  ## Number of loci
#' K <- 2L        ## Number of subclones
#' n <- 3L        ## Number of samples
#' bkps <- list(c(100, 250)*10, c(150, 400)*10)
#' regions <- list(c("(0,3)", "(0,2)", "(1,2)"), c("(1,1)", "(0,1)", "(1,1)"))
#' datSubClone <- buildSubclones(len=len, nbClones=K, bkps=bkps, regions=regions,
#'                               dataAnnotTP=dataAnnotTP, dataAnnotN=dataAnnotN)
#' W <- rSparseWeightMatrix(nb.samp=n, nb.arch=K)
#' mixture <- mixSubclones(datSubClone, W=W)
#'
#' @export
mixSubclones <- function(subClones, W) {
    ## sanity checks: dimensions
    stop_if_not(length(dim(W)) == 2L)
    stop_if_not(length(subClones) == ncol(W))
    
    ## sanity check: column names
    expectedNames <- c("ct", "baft", "genotype", "cn", "bafn", "pos")
    lapply(subClones, FUN = function(dat){
        mm <- match(expectedNames, colnames(dat))
        if (any(is.na(mm))) {
            lost <- paste(expectedNames[which(is.na(mm))], collapse = ", ")
            stop("Missing column(s) in argument 'subClones': ", lost)
        }
    })
    
    ## sanity check: weights
    if (any(W < 0)) {
        stop("Expecting non-negative weight matrix")
    }
    if (any(rowSums(W) > 1L)) {
        stop("Total tumor fraction larger than 1, please make sure that the row sums of 'W' are at most 1")
    }
        
    idxHom <- which(subClones[[1]]$genotype != 0.5)
    sc <- sapply(seq_len(length(subClones)-1), FUN = function(i) {
        # Test if genotype of i is equal to genotype of j
        genoI <- subClones[[i]]$genotype
        for (j in (i+1):length(subClones)) {
            genoJ <- subClones[[j]]$genotype
            if (sum(genoI == genoJ) != length(genoI)) {
                stop(sprintf("Genotypes are not identical for sublones %s and %s", i, j))
            }
        }
    })

    ## compute parental copy numbers
    c1t <- sapply(subClones, FUN=function(ss) {
        dh <- 2*abs(ss$baft-1/2)
        c <- ss$ct*(1-dh)/2
    })

    c1n <- sapply(subClones, FUN=function(ss) {
        dh <- 2*abs(ss$bafn-1/2)
        c <- ss$cn*(1-dh)/2
    })
    c2t <- sapply(subClones, FUN=function(ss) {
        dh <- 2*abs(ss$baft-1/2)
        c <- ss$ct*(1+dh)/2
    })
    c2n <- sapply(subClones, FUN=function(ss) {
        dh <- 2*abs(ss$bafn-1/2)
        c <- ss$cn*(1+dh)/2
    })
    
    df.res <- apply(W, MARGIN=1L, FUN=function(weights) {
        fracN <- 1 - sum(weights)
        stop_if_not(fracN >= 0) ## should have been caught earlier by above sanity checks
        c1 <- rowSums(cbind(sapply(seq_along(subClones), FUN=function(ii) {
            weights[ii]*c1t[, ii]
        }), fracN*rowMeans(c1n)))
        c2 <- rowSums(cbind(sapply(seq_along(subClones), FUN=function(ii) {
            weights[ii]*c2t[, ii]
        }), fracN*rowMeans(c2n)))
        tcn <- c1+c2

        c1[idxHom] <- NA_real_
        c1[idxHom] <- NA_real_
        c2[idxHom] <- NA_real_
        c2[idxHom] <- NA_real_
        dh <- (c2-c1)/(c2+c1)
        dh[idxHom] <- NA_real_
        data.frame(c1=c1, c2=c2, tcn=tcn, dh=dh,
                   genotype=subClones[[1]]$genotype,
                   chr=rep(1, times=length(c1)),
                   pos=seq_along(c1))
    })

    df.res
}