######################################################################################
# CRG
# Hana SUSAK
#------------------------------------------------------------------------------------
# background model functions
######################################################################################
# Function to assign columns in case no header is provided or naming is not standard
# (better practice would be to already have named columns as specified)
# @param sample.mutations - data frame to which column will be given a hedaer
# @param column - integer/numeric value for each column assignment of header
# @param columnName - character value, new header for the columns
assign.columns <- function(sample.mutations, column, columnName){
if (is.numeric(column)){
column <- as.integer(column)
}
if (!is.integer(column)){
stop(paste(column," must be integer (or numeric which will be cast to integer)"), call. = FALSE)
}
if (length(column) > 1) {
stop("column must be a single integer/numeric variable", call. = FALSE)
}
colnames(sample.mutations)[column] <- as.character(columnName)
return(sample.mutations)
}
# Function to take only exonic mutations
# @param sample.mutations - data frame to which filtration is done
# @param Variant_Classification Variant_Classification (optional) integer/numeric value indicating column in \code{sample.mutations} which contain classification for SNV .
# Default is NULL value (in this case \code{sample.mutations} should already have this column)
exonic.only <- function(sample.mutations, Variant_Classification=NULL){
if (!is.null(Variant_Classification)){
sample.mutations <- assign.columns(sample.mutations, Variant_Classification, "variant_classification")
}
original.col.names <- colnames(sample.mutations)
num.col <- ncol(sample.mutations)
colnames(sample.mutations) <- tolower(colnames(sample.mutations))
# take only exonic
num.del <- nrow(sample.mutations[!sample.mutations$variant_classification %in% c('Frame_Shift_Del', 'Frame_Shift_Ins', 'In_Frame_Del', 'In_Frame_Ins',
'Missense_Mutation', 'Nonsense_Mutation', 'Silent',
'Splice_Site', 'Translation_Start_Site', 'Nonstop_Mutation'), ])
if (num.del ){
warning(paste(num.del, "mutations excluded from analysis as they are classified as one of: 3'UTR, 3'Flank, 5'UTR, 5'Flank, IGR , Intron, RNA, Targeted_Region"))
sample.mutations <- sample.mutations[sample.mutations$variant_classification %in% c('Frame_Shift_Del', 'Frame_Shift_Ins', 'In_Frame_Del', 'In_Frame_Ins',
'Missense_Mutation', 'Nonsense_Mutation', 'Silent',
'Splice_Site', 'Translation_Start_Site', 'Nonstop_Mutation'), ]
}
colnames(sample.mutations)[1:num.col] <- original.col.names
return(sample.mutations)
}
#' Background mutation rate estimated from the observed silent mutations.
#' @description
#' \code{bcgr} function calculates the background probability that a gene is mutated based on the frequency of silent mutations.
#' @param sample.mutations data frame in MAF like format with nonsilent and silent mutations.
#' Columns names/header in \code{sample.mutations} must be:
#' \itemize{
#' \item Variant_Classification : column specified in the MAF format, which distinguishes between silent and nonsilent SNVs
#' \item Hugo_Symbol : column specified in the MAF format, which reports the gene name for each SNV.
#' \item Tumor_Sample_Barcode : column specified in the MAF format, which reports in wich patient the SNV was found.
#' \item CCF : numeric column produce by \code{CCF} function, or calculated previously for each SNV.
#' }
#' @param genes vector of genes which were sequenced.
#' Vector of unique values of Hugo_Symbol names (with possibility of more additional genes which did not have any SNV in the cohort).
#' Default is NULL value and then list of unique genes is taken from \code{sample.mutations}.
#' @param Variant_Classification (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the classification for the SNVs (Silent or not).
#' Default is NULL value (in this case \code{sample.mutations} should already have this column).
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param Hugo_Symbol (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the gene names for the SNVs.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param Tumor_Sample_Barcode (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the sample ids for the SNVs.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param CCF (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the cancer cell fraction information for the SNVs.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @details Assuming neutral selection, the function estimates the expected number of nonsilent mutations from observed number of silent mutations.
#' Na (number of all possible nonsilent substitutions) and Ns (number of all possible silent substitutions) were taken from Lawrence paper.
#' They are provided in this package in the file lawrence.RData.
#' When the expected number of nonsilent mutations for each gene is known, the probability to get a nonsilent mutation in each gene is calculated.
#' This is based on
#' @return a numeric vector of the probabilities that a gene has a nonsilent mutation (not caused by cancer).
#' @keywords background
#' @examples
#' \donttest{
#' # We first need the CCF column
#' sample.genes.mutect <- CCF(sample.genes.mutect)
#' somatic.background <- bcgr(sample.genes.mutect, length.genes$Hugo_Symbol)
#' head(somatic.background)
#' }
#' @export
bcgr <- function(sample.mutations, genes=NULL, Variant_Classification=NULL, Hugo_Symbol=NULL, Tumor_Sample_Barcode=NULL, CCF=NULL){
if (is.atomic(sample.mutations)) {
sample.mutations <- data.frame(x = sample.mutations)
}
if (!is.null(Variant_Classification)){
sample.mutations <- assign.columns(sample.mutations, Variant_Classification, "Variant_Classification")
}
if (!is.null(Hugo_Symbol)){
sample.mutations <- assign.columns(sample.mutations, Hugo_Symbol, "Hugo_Symbol")
}
if (!is.null(Tumor_Sample_Barcode)){
sample.mutations <- assign.columns(sample.mutations, Tumor_Sample_Barcode, "Tumor_Sample_Barcode")
}
if (!is.null(CCF)){
sample.mutations <- assign.columns(sample.mutations, CCF, "CCF")
}
# make it not sensitive to lower/upper case in column names
original.col.names <- colnames(sample.mutations)
num.col <- ncol(sample.mutations)
colnames(sample.mutations) <- tolower(colnames(sample.mutations))
if(!'hugo_symbol' %in% colnames(sample.mutations)){
stop(paste("There must be column Hugo_Symbol specified in sample.mutations data frame "), call. = FALSE)
}
if(!'tumor_sample_barcode' %in% colnames(sample.mutations)){
stop(paste("There must be column Tumor_Sample_Barcode specified in sample.mutations data frame "), call. = FALSE)
}
if(!'variant_classification' %in% colnames(sample.mutations)){
stop(paste("There must be column Variant_Classification specified in sample.mutations data frame "), call. = FALSE)
}
if (is.null(genes)){
message('In the function bcgr you did not provide list of genes which were sequenced, union of all protein coding genes (first column of all.genes.lengths) and your genes is taken')
#genes <- unique(sample.mutations$hugo_symbol)
genes <- as.character(union(cDriver::all.genes.lengths$Hugo_Symbol, unique(sample.mutations$hugo_symbol)))
}
if(!is.factor(sample.mutations$hugo_symbol)){
sample.mutations$hugo_symbol <- factor(sample.mutations$hugo_symbol, levels=genes)
}
if(!is.factor(sample.mutations$tumor_sample_barcode)){
sample.mutations$tumor_sample_barcode <- factor(sample.mutations$tumor_sample_barcode, levels=unique(sample.mutations$tumor_sample_barcode))
}
sample.mutations <- exonic.only(sample.mutations)
# estimete number of nonsilent mutations which are high clonal (before cancer occure)
nonsilent.df <- sample.mutations[sample.mutations$variant_classification != 'Silent' & !is.na(sample.mutations$variant_classification), ]
silent.df <- sample.mutations[sample.mutations$variant_classification == 'Silent' & !is.na(sample.mutations$variant_classification), ]
# mutations with CCF above 0.8 are considered clonal | taken as maximum likelihood estimate of lambda (poisson distribution)
#n.healthy <- round( sum( table(nonsilent.df[nonsilent.df$ccf >= 0.85,'tumor_sample_barcode']) )/length(unique(sample.mutations$tumor_sample_barcode)))
n.healthy <- floor(as.numeric(median( table(nonsilent.df[nonsilent.df$ccf >= 0.85,'tumor_sample_barcode']) )))
n.healthy <- max(1, n.healthy)
#silent
mat.sample.gene.s.ccf <- pat.vs.genes(silent.df, genes, valueCol='ccf', sample.gene.lim=1)
observed.mut.s.ccf <- rowSums(mat.sample.gene.s.ccf)
# correction values
prim.corr.s.zero <- cDriver::lawrence.df[genes,]$N_silent/cDriver::lawrence.df[genes,]$N_nonsilent
names(prim.corr.s.zero) <- genes
# indicator which genes are not provided in lawrence table
ind <- names(prim.corr.s.zero[is.na(prim.corr.s.zero)])
avg.corr <- mean(prim.corr.s.zero, na.rm=T) # for missing values, where there is no N_silent and N_nonsilent
if (sum(is.na(prim.corr.s.zero)) > 0){
prim.corr.s.zero[(is.na(prim.corr.s.zero))] <- avg.corr
}
#ind <- (!names(observed.mut.s.ccf) %in% (names(prim.corr.s.zero)))
#prim.corr.s.zero[names(observed.mut.s.ccf)[ind]] <- avg.corr
# calculate corrected (no zeros) somatic silent CCF sums
observed.mut.s.ccf.cor <- c()
observed.mut.s.ccf.cor <- sapply(genes, function(x) max(observed.mut.s.ccf[x], prim.corr.s.zero[x], na.rm=T))
names(observed.mut.s.ccf.cor) <- genes
# estimation of n_a per gene from KaKs
mean.NaNs <- mean((cDriver::lawrence.df[genes,'N_nonsilent']/cDriver::lawrence.df[genes,'N_silent']), na.rm=T)
n_a <- observed.mut.s.ccf.cor[genes]*(cDriver::lawrence.df[genes,'N_nonsilent']/cDriver::lawrence.df[genes,'N_silent'])
names(n_a) <- genes
if (sum(is.na(n_a)) > 0){
warning(paste('For ',sum(is.na(n_a)),' genes there was no Na and Ns values (lawrence.df). Mean ratio of Na/Ns is taken for these genes'))
n_a[is.na(n_a)] <- observed.mut.s.ccf.cor[names(n_a[is.na(n_a)])]*mean.NaNs
}
n_a.prob <- n_a/sum(n_a)
gene.mutated.if.healthy <- sapply(n_a.prob, function(x) sum(dbinom(1:n.healthy, n.healthy, x )))
#return original names
colnames(sample.mutations)[1:num.col] <- original.col.names
gene.mutated.if.healthy
}
#' Background mutation rate based on the published Lawrence's bmr estimates. The bmr for each gene was estimated from a cohort of different tumors.
#' @description
#' \code{bcgr.lawrence} function calculates the background probability that a gene is mutated based on the background somatic mutation rate
#' provided in the Lawrence paper.
#' @param sample.mutations data frame in MAF like format.
#' Columns names/header in \code{sample.mutations} must be:
#' \itemize{
#' \item Variant_Classification : column specified in the MAF format, which distinguishes between silent and nonsilent SNVs
#' \item Hugo_Symbol : column specified in the MAF format, which reports the gene name for each SNV.
#' \item Tumor_Sample_Barcode : column specified in the MAF format, which reports in wich patient the SNV was found.
#' \item CCF : numeric column produce by \code{CCF} function, or calculated previously for each SNV.
#' }
#' @param genes vector of genes which were sequenced.
#' Vector of unique values of Hugo_Symbol names (with possibility of more additional genes which did not have any SNV in the cohort).
#' Default is NULL value and then the list of the unique genes is taken from \code{sample.mutations}.
#' @param lengthGenes numeric vector of the lengths (suquenced) for all \code{genes}. Vector should have the same order as given in the variable genes.
#' Default is NULL value and then the length of the genes is taken from data set \code{length.genes} (form package \code{cDriver}) as defined in column length.
#' If a gene is not found in this data frame, then the median value is taken from the gene list provided by default.
#' @param Variant_Classification (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the classification for the SNVs (Silent or not).
#' Default is NULL value (in this case \code{sample.mutations} should already have this column).
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param Hugo_Symbol (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the gene names for the SNVs.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param Tumor_Sample_Barcode (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the sample ids for the SNVs.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param CCF (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the cancer cell fraction information for the SNVs.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @details Function uses \code{sample.mutations} to estimate the expected number of nonsilent mutations per patient.
#' @return a numeric vector of the probabilites that a gene has a nonsilent mutation (not caused by cancer).
#' @keywords Lawrence
#' @examples
#' bcgr.L <- bcgr.lawrence(sample.genes.mutect, length.genes$Hugo_Symbol, length.genes$Coverd_len)
#' head(bcgr.L)
#' @references \url{http://www.ncbi.nlm.nih.gov/pubmed/23770567}.
#' @export
bcgr.lawrence <- function(sample.mutations, genes=NULL, lengthGenes=NULL, Variant_Classification=NULL, Hugo_Symbol=NULL, Tumor_Sample_Barcode=NULL, CCF=NULL){
#load('data/lawrence.RData')
#ow <- options("warn")
if (is.atomic(sample.mutations)) {
sample.mutations <- data.frame(x = sample.mutations)
}
if (!is.null(Variant_Classification)){
sample.mutations <- assign.columns(sample.mutations, Variant_Classification, "Variant_Classification")
}
if (!is.null(Hugo_Symbol)){
sample.mutations <- assign.columns(sample.mutations, Hugo_Symbol, "Hugo_Symbol")
}
if (!is.null(Tumor_Sample_Barcode)){
sample.mutations <- assign.columns(sample.mutations, Tumor_Sample_Barcode, "Tumor_Sample_Barcode")
}
if (!is.null(CCF)){
sample.mutations <- assign.columns(sample.mutations, CCF, "CCF")
}
# make it not sensitive to lower/upper case in column names
original.col.names <- colnames(sample.mutations)
num.col <- ncol(sample.mutations)
colnames(sample.mutations) <- tolower(colnames(sample.mutations))
if (is.null(lengthGenes)){
message('You did not provide lenght of sequenced genes, so sum of lenghts of all exons for each gene is taken (all.genes.lengths).')
lengthGenes <- cDriver::all.genes.lengths$Length
names(lengthGenes) <- cDriver::all.genes.lengths$Hugo_Symbol
} else if( is.null(names(lengthGenes))) {
if (!is.null(genes)){
names(lengthGenes) <- genes
} else {
stop('When providing length of genes, you need also to provide their names (or as named vector or in genes paramether). ')
}
}
if (is.null(genes)){
message('In the function bcgr.lawrence you did not provide list of genes which were sequenced, union of all protein coding genes (first column of all.genes.lengths) and your genes is taken')
genes <- as.character(union(cDriver::all.genes.lengths$Hugo_Symbol, unique(sample.mutations$hugo_symbol)))
}
if(!is.factor(sample.mutations$hugo_symbol)){
sample.mutations$hugo_symbol <- factor(sample.mutations$hugo_symbol, levels=genes)
}
if(!is.factor(sample.mutations$tumor_sample_barcode)){
sample.mutations$tumor_sample_barcode <- factor(sample.mutations$tumor_sample_barcode, levels=unique(sample.mutations$tumor_sample_barcode))
}
# take only exonic
suppressWarnings( sample.mutations <- exonic.only(sample.mutations) )
## ovde moras napraviti neki warning, vidi koliko gena ima duzinu.
## ako nema daj prosjecnu ali i warning
# From lawrenc
mut.rate <- cDriver::lawrence.df[genes,]$noncoding_mutation_rate
names(mut.rate) <- genes
if(sum(is.na(mut.rate))){
proc.miss.mut.rate <- sum(is.na(mut.rate))/length(genes)
warning("In your list of genes there are genes with unkown mutation rate (lawrence estimates). Median value is taken fot those.
From ",length(genes), " genes there is no mutation rate provided for ", sum(is.na(mut.rate))," \n")
}
mut.rate[is.na(mut.rate)] <- median(mut.rate, na.rm=T)
bcgr <- c()
bcgr.prob <- c()
if(sum(is.na(lengthGenes[genes]))){
# to be sure of order and to add lengths which are in genes but not ine lengthGenes
lengthGenes <- lengthGenes[genes]
names(lengthGenes) <- genes
miss.len <- sum(is.na(lengthGenes))
lengthGenes[is.na(lengthGenes)] <- median(lengthGenes, na.rm=T)
if (miss.len){
proc.mis <- miss.len/length(genes)
#options(warn = w); cat("\n warn =", w, "\n")
warning("In your list of genes there are genes with unkown lenght. Medain value is taken for those.
From ",length(genes), " genes there is no length provided for ",miss.len," \n")
}
} else {
lengthGenes <- lengthGenes[genes]
names(lengthGenes) <- genes
}
Ns.Na <- cDriver::lawrence.df[genes,'N_silent']/cDriver::lawrence.df[genes,'N_nonsilent']
if (sum(is.na(Ns.Na)) > 0){
warning(paste('For ',sum(is.na(Ns.Na)),' genes there was no Na and Ns values (lawrence.df). Mean ratio of Ns/Na is taken for these genes'))
Ns.Na[is.na(Ns.Na)] <- mean(Ns.Na, na.rm=T)
}
bcgr[genes] <- (mut.rate[genes]*lengthGenes[genes])/(1+Ns.Na)
bcgr.prob[genes] <- bcgr[genes] / sum(bcgr[genes])
# estimete number of nonsilent mutations which are high clonal (before cancer occure)
nonsilent.df <- sample.mutations[sample.mutations$variant_classification != 'Silent' & !is.na(sample.mutations$variant_classification), ]
# mutations with CCF above 0.8 are considered clonal | taken as maximum likelihood estimate of lambda (poisson distribution)
#n.healthy <- round( sum( table(nonsilent.df[nonsilent.df$ccf >= 0.85,'tumor_sample_barcode']) )/length(unique(sample.mutations$tumor_sample_barcode)))
n.healthy <- floor(as.numeric(median( table(nonsilent.df[nonsilent.df$ccf >= 0.8,'tumor_sample_barcode']) )))
gene.mutated.if.healthy <- sapply(bcgr.prob, function(x) sum(dbinom(1:n.healthy, n.healthy, x )))
#warnings()
#options(ow) # reset
#return original names
colnames(sample.mutations)[1:num.col] <- original.col.names
gene.mutated.if.healthy
}
#' Combining two somatic background mutation probability values (usually obtained from functions bcgr.lawrence and bcgr)
#' @description
#' \code{bcgr.combine} function first calculates both somatic background mutation probabilities (using bcgr.lawrence and bcgr functions) and then it takes the average value for each gene
#' @param sample.mutations data frame in MAF like format.
#' Columns names/header in \code{sample.mutations} must be: e:
#' \itemize{
#' \item Variant_Classification : column specified in the MAF format, which distinguishes between silent and nonsilent SNVs
#' \item Hugo_Symbol : column specified in the MAF format, which reports the gene name for each SNV.
#' \item Tumor_Sample_Barcode : column specified in the MAF format, which reports in wich patient the SNV was found.
#' \item CCF : numeric column produce by \code{CCF} function, or calculated previously for each SNV.
#' }
#' @param genes vector of genes which were sequenced.
#' Vector of unique values of Hugo_Symbol names (with possibility of more additional genes which did not have any SNV in the cohort).
#' Default is NULL value and then the list of the unique genes is taken from \code{sample.mutations}.
#' @param lengthGenes numeric vector of the lengths (suquenced) for all \code{genes}. Vector should have the same order as given in the variable genes.
#' Default is NULL value and then the length of the genes is taken from data set \code{length.genes} (form package \code{cDriver}) as defined in column length.
#' If a gene is not found in this data frame, then the median value is taken from the gene list provided by default.
#' @param Variant_Classification (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the classification for the SNVs (Silent or not).
#' Default is NULL value (in this case \code{sample.mutations} should already have this column).
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param Hugo_Symbol (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the gene names for the SNVs.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param Tumor_Sample_Barcode (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the sample ids for the SNVs.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param CCF (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the cancer cell fraction information for the SNVs.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @return a numeric vector of the probabilites that a gene has a nonsilent mutation (not caused by cancer).
#' @keywords Lawrence
#' @examples
#' # First we need to calculate CCF
#' sample.genes.mutect <- CCF(sample.genes.mutect)
#' # Calculate somatic nonsilent background mutation probability
#' background <- bcgr.combine(sample.genes.mutect, length.genes$Hugo_Symbol, length.genes$Coverd_len)
#' head(background)
#' @references \url{http://www.ncbi.nlm.nih.gov/pubmed/23770567}.
#' @export
bcgr.combine <- function(sample.mutations, genes=NULL, lengthGenes=NULL, Variant_Classification=NULL, Hugo_Symbol=NULL, Tumor_Sample_Barcode=NULL, CCF=NULL){
na.p.lawrence <- bcgr.lawrence(sample.mutations, genes=genes, lengthGenes=lengthGenes, Variant_Classification=Variant_Classification, Hugo_Symbol=Hugo_Symbol, Tumor_Sample_Barcode=Tumor_Sample_Barcode, CCF=CCF)
na.p.silent <- bcgr(sample.mutations, genes=genes, Variant_Classification=Variant_Classification, Hugo_Symbol=Hugo_Symbol, Tumor_Sample_Barcode=Tumor_Sample_Barcode, CCF=CCF)
if (is.null(genes)){
# make it not sensitive to lower/upper case in column names
original.col.names <- colnames(sample.mutations)
num.col <- ncol(sample.mutations)
colnames(sample.mutations) <- tolower(colnames(sample.mutations))
#genes <- unique(sample.mutations$hugo_symbol)
genes <- as.character(union(cDriver::all.genes.lengths$Hugo_Symbol, unique(sample.mutations$hugo_symbol)))
#return original names
colnames(sample.mutations)[1:num.col] <- original.col.names
}
#together <- as.data.frame(cbind(na.p.lawrence[genes], na.p.silent[genes]))
#names(together) <- c('noncoding_lawrence','nonsilent_kaks')
#gene.mutated.if.healthy <- apply(as.matrix(together),1,mean)
gene.mutated.if.healthy <- (na.p.lawrence[genes] + na.p.silent[genes])/2
names(gene.mutated.if.healthy) <- genes
return(gene.mutated.if.healthy)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.