tempoSig: Maximum Likelihood Fitting of Mutational Catalogs and De Novo Inference of Signatures

Documented in dinucleotides maf2cat plotExposure senspec trinucleotides writeExposure

#' Trinucleotide labels
#'
#' @export
trinucleotides <- function(nt = 'ACGT', brackets = TRUE, arrows = TRUE){

  nt2 <- strsplit(nt, split='')[[1]]
  pyr <- nt2[nt2 %in% c('C', 'T')]
  w <- expand.grid(nt2, pyr, stringsAsFactors = FALSE)
  w <- w[,c(2,1)]
  w <- w[w[, 1]!=w[,2],]
  sp <- ifelse(arrows, '>', '')
  wx <- apply(w, 1, function(x){paste(c(x[1], sp, x[2]),collapse='')})

  z <- expand.grid(nt2, nt2, wx, stringsAsFactors = FALSE)
  z <- z[, c(2, 3, 1)]
  if(brackets) sp <- c('[',']')
  else sp <- c('','')
  w <- apply(z, 1, function(x){paste(c(x[1], sp[1], x[2], sp[2], x[3]), collapse = '')})
  names(w) <- NULL
  return(w)
}

#' Dinucleotide labels
#' 
#' @export
dinucleotides <- function(nt = 'ACGT'){
  
  nt2 <- strsplit(nt, split='')[[1]]
  src <- c('AC','AT','CC','CG','CT','GC','TA','TC','TG','TT') # source dinucleotide
  dbs <- NULL
  for(i in seq_along(src)){
    si <- src[i]
    sn <- strsplit(si,split='')[[1]]
    for(x in nt2[nt2!=sn[1]]) for(y in nt2[nt2!=sn[2]]){
      tgt <- c(x,y)
      sn2 <- rcmp(sn)
      tg2 <- rcmp(tgt)
      db <- paste(c(si,'>',tgt),collapse='')
      db2 <- paste(c(sn2,'>',tg2),collapse='')
      if(!db %in% dbs & !db2 %in% dbs){
        if(i %in% c(4,7)) dbs <- c(dbs, db2)
        else dbs <- c(dbs, db)
      }
    }
  }
  return(dbs)
}

rcmp <- function(x){
  
  w <- x
  w[x=='A'] <- 'T'
  w[x=='C'] <- 'G'
  w[x=='G'] <- 'C'
  w[x=='T'] <- 'A'
  
  return(rev(w))
}
#' 
#' Plot exposure
#'
#' Display barplot of exposure extracted for a specific sample
#'
#' @param object Object of class \code{tempoSig}
#' @param sample.id Sample index or name (\code{Tumor_Sample_Barcode}) to display
#' @param cutoff Minimum proportion for displaying signature labels
#' @param ... Other parameters to \code{barplot}
#' @return None
#' @export
plotExposure <- function(object, sample.id, cutoff = 1e-3, ...){

  if(!is(object, 'tempoSig')) stop('Object is not of class tempoSig')
  sname <- names(tmb(object))
  if(is.character(sample.id)){
    sample.id <- which(sname == sample.id)
    if(length(sample.id)==0) stop(paste0(sample.id, 'is not in object'))
  }
  sample.id <- as.integer(sample.id)
  expo <- expos(object)
  if(sample.id < 1 | sample.id > NROW(expo)) stop('sample.id out of bound in object')
  e <- expo[sample.id,]
  e <- e[e >= cutoff]
  names.arg <- names(e)
  names.arg[e < cutoff] <- ''
  graphics::barplot(e, main = sname[sample.id], las=2, names.arg = names.arg,
                    ylab = 'Proportions', ...)

  return(invisible(e))
}

#' Write Exposure
#'
#' Save a text output of exposures
#'
#' Writes a text file of specified name.
#'
#' @param object Object of class \code{tempoSig}
#' @param output File name of the output
#' @param sep Delimiter, either space or tab.
#' @param rm.na Remove rows with NAs (mutation load below minimum)
#' @param pv.out File name for p-value output. If \code{NULL}, \code{output}
#'        file is written with alternating observed and p-value columns.
#' @param cBio.format File output in cBioPortal \code{Generic Assay} format; 
#'        only works with \code{pv.out != NULL}
#'
#' @export
writeExposure <- function(object, output, sep = '\t', rm.na = FALSE, pv.out = NULL,
                          cBio.format = FALSE){

  if(!is(object, 'tempoSig')) stop('Object is not of class tempoSig')
  if(!is.character(output)) stop('Output file name must be characters')
  if(!sep %in% c(' ','\t')) stop('Delimiter must be either space or tab')
  expo <- expos(object)
  if(all(dim(expo) == 0)) stop('Exposure in object empty')
  if(cBio.format & is.null(pv.out)) stop('cBio.format requires pv.out')
 
  bad <- apply(expo, 1, function(x){all(is.na(x))})
  if(rm.na){
    if(sum(!bad)==0) stop('All samples are NA and rm.na = TRUE')
    expo <- expo[!bad, , drop = FALSE]
    tmba <- tmb(object)[!bad]
  } else
    tmba <- tmb(object)

  is.pv <- !all(dim(pvalue(object)) == 0)   # pvalue is not empty
  out0 <- data.frame(Tumor_Sample_Barcode = rownames(expo), TMB = tmba)
  
  if(!is.pv | !is.null(pv.out))  # no pvalue or separate output
    out <- cbind(out0, as.data.frame(expo))
  else out <- out0
  
  if(is.pv){
    if(!is.null(pv.out)) pout <- out0
    pv <- pvalue(object)
    if(rm.na) pv <- pv[!bad, , drop = FALSE]
    sig.names <- colnames(expo)
    for(k in seq(NCOL(expo))){
      if(is.null(pv.out)){
        tmp <- data.frame(expo[,k], pv[,k])
        names(tmp) <- paste0(sig.names[k], c('.observed','.pvalue'))
        out <- cbind(out, tmp)
      } else{
        tmp <- data.frame(pv[,k])
        names(tmp) <- sig.names[k]
        pout <- cbind(pout, tmp)
      }
    }
  }
  
  if(!cBio.format){
    colnames(out)[1:2] <- c('Sample Name', 'Number of Mutations') # compatibility
    if(!is.null(pv.out)) colnames(pout)[1:2] <- colnames(out)[1:2]
  } else{
    if(sum(is.na(rownames(out)))) 
      rownames(out) <- out[,2]
    out <- out[,-2, drop = FALSE]   # remove no. of mutation column
    if(colnames(out)[2]=='Signature.1')  # v2
      annot <- read.csv(system.file('extdata', 'msig_cBioPortal_v2.csv', package = 'tempoSig'))
    else if(colnames(out)[2]=='SBS1')  # v3
      annot <- read.csv(system.file('extdata', 'msig_cBioPortal_v3.csv', package = 'tempoSig'))
    else
      stop('cBioFormat not compatible with custom reference')
    
    idx <- match(colnames(out)[-1], annot[,1])
    out <- cbind(data.frame(
            ENTITY_STABLE_ID = paste('mutational_signature_contribution', annot[,1], sep='_'),
            NAME = annot[,2], 
            DESCRIPTION = annot$Description,
            URL = annot$URL), t(as.matrix(out[,-1]))[idx,])
    if(exists('pout')){
      if(sum(is.na(rownames(pout)))) 
        rownames(pout) <- pout[,2]
      pout <- pout[,-2, drop = FALSE]   # remove no. of mutation column
      pout <- cbind(data.frame(
        ENTITY_STABLE_ID = paste('mutational_signature_pvalue', annot[,1], sep='_'),
        NAME = annot[,2], 
        DESCRIPTION = annot$Description,
        URL = annot$URL), t(as.matrix(pout[,-1]))[idx,])
    }
  }
  
  write.table(out, file = output, sep = sep, row.names = F, quote = F)
  if(!is.null(pv.out)) write.table(pout, file = pv.out, sep = sep, row.names = F,
                                   quote = F)
  return(invisible(object))
}

#' Generate Mutation Catalog from MAF file
#' 
#' Input is MAF file and the corresponding trinucleotide catalog matrix is generated
#' 
#' @param maf MAF file name or data frame. It must contain a column named \code{Ref_Tri} 
#'        containing the trinucleotide sequences surrounding the mutation site.
#'        If the mutation site reference allele is \code{A, G}, the \code{Ref_Tri}
#'        is the complement sequence of the trinucleotides, such that its central
#'        allele is always \code{C, T}.
#' @param progress.bar Display progress bar
#'        
#' @return Count matrix with mutation contexts in rows and \code{Tumor_Sample_Barcode}
#'         in columns.
#' @export
maf2cat <- function(maf, progress.bar = TRUE){
  
  if(is(maf, 'character')){ 
    if(!file.exists(maf)) stop(paste0('File ', maf, ' does not exist'))
    x <- data.table::fread(maf)
  } else x <- maf
  x <- as.data.frame(x)
  col <- colnames(x)
  if(!'Ref_Tri' %in% col) stop('Column Ref_Tri does not exist in MAF')
  if('End_position' %in% col){
    col[col=='End_position'] <- 'End_Position'
    colnames(x) <- col
  }
  if('Variant_Type' %in% colnames(x))
    x <- x[x$Variant_Type == 'SNP', ]
  
  x <- x[, c('Tumor_Sample_Barcode', 'Chromosome', 'Start_Position', 'End_Position',
           'Reference_Allele', 'Tumor_Seq_Allele2', 'Ref_Tri')] 
  x <- x[!duplicated(x) & x$Ref_Tri != '',]
  
  m <- NROW(x)
  mut <- rep('', m)
  if(m > 0){
    if(progress.bar) pb <- txtProgressBar(style = 3)
    for(i in seq(m)){
       w <- c(t(x[i,c('Reference_Allele', 'Tumor_Seq_Allele2')]), 
            strsplit(as.character(x[i, 'Ref_Tri']), split='')[[1]])
       if(w[1] %in% c('A','G')) 
         w[1:2] <- ntdag(w[1:2])  # Ref_Tri is already pyrimidine at the center
       mut[i] <- paste(c(w[3], '[', w[1], '>', w[2], ']', w[5]), collapse='')
       if(progress.bar) setTxtProgressBar(pb, i/m)
    }
    if(progress.bar) close(pb)
    mut <- factor(mut, levels=trinucleotides(), ordered = TRUE)
    tmut <- table(mut, x$Tumor_Sample_Barcode)
    tmut <- as.data.frame.matrix(tmut)
  } else{
    tmut <- data.frame(na=rep(0, 96))
    rownames(tmut) <- trinucleotides()
  }
  
  return(tmut)
}

# nucleotide complement
ntdag <- function(nt){
  
  z <- nt
  for(i in seq_along(nt)){
    if(nt[i]=='A') z[i] <- 'T'
    else if(nt[i]=='T') z[i] <- 'A'
    else if(nt[i]=='G') z[i] <- 'C'
    else if(nt[i]=='C') z[i] <- 'G'
    else stop('Unknown nucleotide in maf file')
  }
  return(z)
}

#' True and false positive rates
#' 
#' From supplied exposure prediction and actual vectors, compute 
#' true and false positive rates
#' @param xhat Predicted exposure vector
#' @param x True exposure vector
#' @param pvalue Vector of prediction p-values
#' @param alpha False positive rate threshold
#' @return list of vectors \code{tp} (true positives), \code{fp} (false positives),
#'         all positives \code{positives} and negatives \code{negatives} in
#'         names of \code{xhat}
#' @export
senspec <- function(xhat, x, pvalue = NULL, alpha = 0.05){
  
  if(is.null(names(xhat)) | is.null(names(x)))
    stop('Input vectors must have names')
  if(!is.null(pvalue)){ 
    if(sum(is.null(names(pvalue))) > 0) stop('pvalues must have names')
    if(length(xhat) != length(pvalue)) stop('xhat and pvalue have different lengths')
    if(!all(names(xhat)==names(pvalue))) stop('xhat and pvalue names mismatch')
  }

  sig <- union(names(xhat), names(x))
  vsig <- rep(0, length(sig))
  names(vsig) <- sig
  x <- vectorPad(x, vsig, value=0)
  xhat <- vectorPad(xhat, vsig, value=0)
  if(!is.null(pvalue)) pvalue <- vectorPad(pvalue, vsig, value=1)

  bhat <- xhat > 0
  if(!is.null(pvalue))           # filter with pvalues for significant subset
    bhat <- bhat & (pvalue <= alpha)
  b <- x > 0
# tpr <- sum(bhat & b) / sum(b)  # true positive rate
# fpr <- sum(bhat & !b) /sum(!b) # false positive rate
  tp <- names(bhat)[bhat & b]
  fp <- names(bhat)[bhat & !b]
  positives <- names(bhat)[b]
  negatives <- names(bhat)[!b]
  
  return(list(tp=tp, fp=fp, positives=positives, negatives=negatives))
}

# Expand named vector x to match xref names and order
vectorPad <- function(x, xref, value = 0){
  
  x2 <- rep(value, length(xref))
  names(x2) <- names(xref)
  x2[names(x)] <- x
  return(x2[match(names(xref), names(x2))])
}

mskcc/tempoSig documentation built on Feb. 3, 2023, 8:35 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

mskcc/tempoSig
Maximum Likelihood Fitting of Mutational Catalogs and De Novo Inference of Signatures

R/utilities.R
In mskcc/tempoSig: Maximum Likelihood Fitting of Mutational Catalogs and De Novo Inference of Signatures

Defines functions vectorPad senspec ntdag maf2cat writeExposure plotExposure rcmp dinucleotides trinucleotides

Documented in dinucleotides maf2cat plotExposure senspec trinucleotides writeExposure

R Package Documentation

Browse R Packages

We want your feedback!

mskcc/tempoSig Maximum Likelihood Fitting of Mutational Catalogs and De Novo Inference of Signatures

R/utilities.R In mskcc/tempoSig: Maximum Likelihood Fitting of Mutational Catalogs and De Novo Inference of Signatures

Defines functions vectorPad senspec ntdag maf2cat writeExposure plotExposure rcmp dinucleotides trinucleotides

Documented in dinucleotides maf2cat plotExposure senspec trinucleotides writeExposure

R Package Documentation

Browse R Packages

We want your feedback!

mskcc/tempoSig
Maximum Likelihood Fitting of Mutational Catalogs and De Novo Inference of Signatures

R/utilities.R
In mskcc/tempoSig: Maximum Likelihood Fitting of Mutational Catalogs and De Novo Inference of Signatures