Butte: BoUnds of Time Till Expansion, inferring the SCNA arrival time in the somatic evolution toward the most recent common ancestor of tumor sample(s).

Documented in cnmutData mergeCNA scnaInput scnaTiming ssnvInput

#' estimate the initiation and arrival time for each SCNA segment given a file containing multiple SCNA segments, and a file containing SSNV data.
#' 
#'
#' @param scnaFile the SCNA segmentation file
#' @param ssnvFile the SSNV file
#' @param sn sample name
#' @param outname output timing file name (including path)
#' @param public somatic timeline is focused on public mutations (across multi-samples) or not (otherwise, clonal variants in the specific sample)
#' @param pubOrSub the colname for column indicating if the mutation is public or not
#' @param mmut minimum number of mutations for running timing analysis
#' @param qmethod the method for estimating q (probabilities of a randomly acquired mutation having allele state of aj/Nt)
#' @param skipchunk segments with number of data points (probes) no more than this number will be skipped, set it to zero if not skipped
#' @param B number of bootstrap for calculating confidence interval
#' @param purity the purity of the tumor sample. if not specified, the program will search in the ssnv file for the column with the header as sn"pu" (see example input)
#' @return list: timing result; timing table (for visualization) and merged CNA data frame. For butte cases (non-identifiable), pi[1] is the lower bound, and pi[2] is the upper bound. piCI[1,] and piCI[2,] are the bootstrapped confidence interval for the two bounds, respectively.
#' @export
scnaTiming <- function(scnaFile, ssnvFile, sn, outname, public=FALSE, pubOrSub="pubOrSub",
                      skipchunk = 19, mmut=10, qmethod="fullMLE", B=100, purity=NA) {

    timingInput = cnmutData(scnaFile=scnaFile, ssnvFile=ssnvFile, skipchunk=skipchunk)
    cnvA2   = timingInput[[1]]
    sampAB  = timingInput[[2]]
    cnvHits = timingInput[[3]]
    snvHits = timingInput[[4]]
    
    #produce results
    result = list()
    resultTable = vector()
    li = 1
    for (i in 1:dim(cnvA2)[1]) {
        cnchrom = as.character(cnvA2$chrom[i])
        cnstart = cnvA2$loc.start[i]
        cnend = cnvA2$loc.end[i]
        cnLOHcall = as.character(cnvA2$LOHcall[i])
        cnminor = cnvA2$minor_cn[i]
        cnmajor = cnvA2$major_cn[i]
        cntotal = cnvA2$copynumber[i]
        message(paste(cnchrom, cnstart, cnend, cntotal, cnminor, sep="\t"))

        #skip SCNAs with Nt >= 8
        if (cntotal >= 8 | (cntotal == 2 & cnminor == 1) | (cntotal <= 1 & cnminor == 0)) {
            message(paste("skip cnv", i, paste0(cntotal, ":", cnminor), sep=" "))
            next
        }
        
        # chop data to study the relevant ssnvs inside the SCNA region
        chopped = sampAB[snvHits[which(cnvHits == i)],]
        if (public == TRUE) {    #focus on the truncal ones to multi samples
            chopped = chopped[which(chopped[,pubOrSub] == "public"),]                                    
        } else {                 #focus on the clonal ssnvs in the corresponding sample
            chopped = chopped[which(chopped[,paste(sn,"ccf",sep="")] +
                                    1.96*chopped[,paste(sn,"ccfSD",sep="")] >= 1),]
        }

        #skip SCNAs containing less than mmut mutations
        if (dim(chopped)[1] < mmut) {
            message(paste("skip cnv", i, "< min # of mutations", dim(chopped)[1], sep=" "))
            next
        }

        if (is.na(purity)) {  #if purity is not defined
          purity = chopped[,paste(sn,"pu",sep="")][1]
        }
        
        onlyMuts = data.frame(chromosome=chopped$chr, position=chopped$pos,
                              refbase=chopped$ref, mutbase=chopped$alt, rsID=chopped$id,
                              t_ref_count=chopped[,paste(sn,"refc",sep="")],
                              t_alt_count=chopped[,paste(sn,"altc",sep="")],
                              allelefreq=chopped[,paste(sn,"mafc",sep="")],
                              t_depth=chopped[,paste(sn,"refc",sep="")] +
                                  chopped[,paste(sn,"altc",sep="")])
                              #t_depth=chopped[,paste(sn,"d",sep="")])
        
        # SCNA has two groups
        # group1: one group with an unique and identifiable history matrix A
        hmatrix = cnmutHistory(nt = cntotal, nb = cnminor)
        if (length(hmatrix) == 1 & dim(hmatrix[[1]])[1] == dim(hmatrix[[1]])[2])  #2:0 3:1 3:0 4:1
            cntype = "identifiable"
        
        # group2: the other does not have identifiable history, or there are mulitple A
        else
            cntype = "butte"
        message(cntype)

        # call Butte for estimating timing
        x <- Butte(x=onlyMuts$t_alt_count, m=onlyMuts$t_depth, history=hmatrix, qmethod=qmethod,
                   nt=cntotal, nb=cnminor, minMutations=mmut, type=cntype,
                   purity=purity, bootstrapCI="bootstrap", B=B)
        
        x = c(x, cnid=i, cnchrom=cnchrom, cnstart=cnstart, cnend=cnend,
              cnmajor=cnmajor, cnminor=cnminor, cnLOHcall=cnLOHcall)

        result[[li]] = x
        names(result)[[li]] = i

        K = length(x$pi)
        currentline = c(cnchrom, cnstart, cnend, cnmajor, cnminor, x$summaryTable[1], cntype,
                        as.numeric(x$pi[1]), as.numeric(x$piCI[1,1]), as.numeric(x$piCI[1,2]),      #time of the first stage
                        as.numeric(x$pi[K]), as.numeric(x$piCI[K,1]), as.numeric(x$piCI[K,2]))      #time of the last stage
        
        names(currentline) = c("chrom", "loc.start", "loc.end", "major_cn", "minor_cn", "nmut", "type",
                               "p0","p0l","p0h","pK","pKl","pKh")
        
        message(paste(currentline, collapse=" "))
        if (length(resultTable) == 0) {
            resultTable = currentline
        } else {
            # combine the result into a data.frame
            resultTable = rbind(resultTable, currentline)
        }
        li=li+1
    }
    resultTable = data.frame(resultTable)
    return(list(result, resultTable, cnvA2))    
}


#' Reading and sorting scnaFile 
#' 
#'
#' @param scnaFile the SCNA segmentation file
#' @param skipchunk segments with number of data points (probes) no more than this number will be skipped, set it to zero if not skipped
#' @return sorted scna segmentation data frame
#' @importFrom dplyr arrange
#' @export
scnaInput <- function(scnaFile, skipchunk=19) {
    scna = scnaFile
    #smooth the CN profie according to the minimum segment size (to skip)
    scna = mergeCNA(cnFile = scnaFile, skipchunk = skipchunk)
    #sort by chr and coordinates
    scna = dplyr::arrange(scna, chrom, loc.start, loc.end)
    return(scna)    
}


#' Reading and sorting ssnvFile 
#' 
#'
#' @param ssnvFile the SSNV file
#' @return sorted ssnv data frame
#' @importFrom dplyr arrange
#' @export
ssnvInput <- function(snvFile) {
    if (! file.exists(snvFile)) {
        stop("ssnvFile needs to be provided!")
    }
    ssnv = read.delim(snvFile, stringsAsFactors=FALSE, check.names=FALSE)
    #sort by chr and coordinates
    ssnv = dplyr::arrange(ssnv, chr, pos)
    return(ssnv)
}


#' Generate the SCNA SSNV input for running butte 
#'
#' Four elements will be generated in the output list
#' merged CNA segmentation;
#' SSNV data.frame;
#' cnvHits (index in CNA file, overlapping ssnv)
#' snvHits (index in SSNV file, overlapping with cnv)
#' 
#'
#' @param scnaFile the SCNA segmentation file 
#' @param ssnvFile the SSNV file
#' @param skipchunk segments with number of data points (probes) no more than this number will be skipped, set it to zero if not skipped 
#' @return list of data input for running butte
#' @importFrom GenomicRanges GRanges
#' @importFrom GenomicRanges findOverlaps
#' @importFrom IRanges IRanges
#' @importFrom S4Vectors queryHits
#' @importFrom S4Vectors subjectHits
#' @export
cnmutData <- function(scnaFile, ssnvFile, skipchunk = 19) {

    cnvA2 = scnaInput(scnaFile, skipchunk=skipchunk)
    sampAB = ssnvInput(ssnvFile)
    
    #overlap between SCNA and SSNV
    #first make it consistent the chromosome prefix
    cnvSeqNames = as.character(cnvA2$chrom)
    if ( !grepl("chr", cnvA2$chrom[1]) ) {
        cnvSeqNames = paste("chr", cnvA2$chrom, sep="")
    }
    message(cnvSeqNames[1])

    snvSeqNames = sampAB$chr
    if ( !grepl("chr", sampAB$chr[1]) ) {
        snvSeqNames = paste("chr", sampAB$chr, sep="")
    }
    message(snvSeqNames[1])

    #find overlaps
    cnvRangeA = GenomicRanges::GRanges(seqnames = cnvSeqNames,
                                       ranges = IRanges::IRanges(cnvA2$loc.start, end=cnvA2$loc.end),
                                       strand=rep('+',dim(cnvA2)[1]))
    snvRange = GenomicRanges::GRanges(seqnames = snvSeqNames,
                                      ranges = IRanges::IRanges(sampAB$pos, end=sampAB$pos),
                                      strand=rep('+',dim(sampAB)[1]))
    foA = GenomicRanges::findOverlaps(cnvRangeA, snvRange)
    cnvHits = S4Vectors::queryHits(foA)
    snvHits = S4Vectors::subjectHits(foA)
    return(list(cnvA2, sampAB, cnvHits, snvHits))

}







#' Merge the CNA by jumping (neglecting) small segments 
#'
#' When CNA segmentation contains many small segments, one may want to
#' merge the two neighboring segments by skipping the small segment.  
#'
#' @param cnFile the SCNA segmentation file 
#' @param skipchunk segments with number of data points (probes) no more than this number will be skipped, set it to zero if not skipped 
#' @param correctMale logical, whether or not divide by 2 for the X chromosome (testing)
#' @return data frame of the merged CNA segmentation
#' @export
mergeCNA <- function(cnFile, skipchunk = 19, correctMale = FALSE) {

    if (! file.exists(cnFile)) {
        stop("cnFile (titan segmentation file) not found!")
    }
    message(cnFile)
    cnvA = read.delim(cnFile, stringsAsFactors=FALSE, check.names=FALSE)
    
    if (skipchunk < 1) {  #if skipchunk is not defined, then return and stop here
        return(cnvA)
    }

    #here we go to the merging
    cnvA = cnvA[which(cnvA$num.mark > skipchunk),]                               #skip two few marks
    cp2 <- c(which(cnvA$logcopynumberratio[-1] != cnvA$logcopynumberratio[-nrow(cnvA)] |
                       cnvA$chrom[-1] != cnvA$chrom[-nrow(cnvA)] |
                           cnvA$LOHcall[-1] != cnvA$LOHcall[-nrow(cnvA)]),
             nrow(cnvA))
    cp1 <- c(1,cp2[-length(cp2)]+1)
    cnvA2 <- data.frame(chrom=cnvA$chrom[cp1],
                        loc.start=cnvA$loc.start[cp1],
                        loc.end=cnvA$loc.end[cp2],
                        num.mark=cnvA$num.mark[cp1],         #recal later
                        seg.mean=cnvA$seg.mean[cp1],         #recal later
                        copynumber=cnvA$copynumber[cp1],
                        minor_cn=cnvA$minor_cn[cp1],
                        major_cn=cnvA$major_cn[cp1],
                        allelicratio=cnvA$allelicratio[cp1], #recal later
                        LOHcall=cnvA$LOHcall[cp1],
                        cellularprevalence=cnvA$cellularprevalence[cp1],
                        ploidy=cnvA$ploidy[cp1],
                        normalproportion=cnvA$normalproportion[cp1],
                        logcopynumberratio=cnvA$logcopynumberratio[cp1])
    for (j in 1:length(cp1)) {
        cnvA2$num.mark[j] <- sum(cnvA$num.mark[cp1[j]:cp2[j]])
        cnvA2$seg.mean[j] <- mean(cnvA$seg.mean[cp1[j]:cp2[j]])
        cnvA2$allelicratio[j] <- mean(cnvA$allelicratio[cp1[j]:cp2[j]])
    }
    if (correctMale) {   #testing purpose only, for chromosomeX, divide by 2
        cnvA2[which(as.character(cnvA2$chrom == "X")), "copynumber"] =
            cnvA2[which(as.character(cnvA2$chrom == "X")), "copynumber"]/2
        cnvA2[which(as.character(cnvA2$chrom == "X")), "major_cn"] =
            cnvA2[which(as.character(cnvA2$chrom == "X")), "major_cn"]/2
    }
    return(cnvA2)
}

SunPathLab/Butte documentation built on Sept. 19, 2023, 9:42 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

SunPathLab/Butte
BoUnds of Time Till Expansion, inferring the SCNA arrival time in the somatic evolution toward the most recent common ancestor of tumor sample(s).

R/cnmut.R
In SunPathLab/Butte: BoUnds of Time Till Expansion, inferring the SCNA arrival time in the somatic evolution toward the most recent common ancestor of tumor sample(s).

Defines functions mergeCNA cnmutData ssnvInput scnaInput scnaTiming

Documented in cnmutData mergeCNA scnaInput scnaTiming ssnvInput

R Package Documentation

Browse R Packages

We want your feedback!

SunPathLab/Butte BoUnds of Time Till Expansion, inferring the SCNA arrival time in the somatic evolution toward the most recent common ancestor of tumor sample(s).

R/cnmut.R In SunPathLab/Butte: BoUnds of Time Till Expansion, inferring the SCNA arrival time in the somatic evolution toward the most recent common ancestor of tumor sample(s).

Defines functions mergeCNA cnmutData ssnvInput scnaInput scnaTiming

Documented in cnmutData mergeCNA scnaInput scnaTiming ssnvInput

R Package Documentation

Browse R Packages

We want your feedback!

SunPathLab/Butte
BoUnds of Time Till Expansion, inferring the SCNA arrival time in the somatic evolution toward the most recent common ancestor of tumor sample(s).

R/cnmut.R
In SunPathLab/Butte: BoUnds of Time Till Expansion, inferring the SCNA arrival time in the somatic evolution toward the most recent common ancestor of tumor sample(s).