######################################################################################
# CRG
# Hana SUSAK
#------------------------------------------------------------------------------------
# Bayes models
######################################################################################
#' @import ggplot2
#' @import data.table
## @import reshape2
#' @import Rmpfr
#' @importFrom stats sd aggregate as.formula prcomp xtabs
#' @importFrom grDevices dev.off pdf
#' @importFrom utils methods write.table
## @importFrom base apply
# Function to create matrix genes versus patients.
# Values of matrix can be counts of mutations, or sum of some other variable.
# Also it user can provide upper limit for sum of this value for gene-patienet pair.
# @param sample.mutations data frame to be converted gene-patient matrix
# @param genes vector of unique genes of interest
# @param valueCol Column for which cuming gonna be done (like CCF, Demage), should be numeric or character (exact name of column).
# If NULL, it will count occurences.
# @param Hugo_Symbol (optional) integer/numeric value indicating column in \code{sample.mutations} having gene names for reported SNVs.
# Default is NULL value (in this case \code{sample.mutations} should already have this column)
# @param Tumor_Sample_Barcode (optional) integer/numeric value indicating column in \code{sample.mutations} which have sample ids for SNVs.
# Default is NULL value (in this case \code{sample.mutations} should already have this column)
# @param sample.gene.lim a numeric value specifying upper limit when summing is perfomed for gene-sample pair
# @param mode a charechter value indicationg how to solve when in one gene-sample pair there are multiple mutations. Options are SUM, MAX and ADVANCE
# @param epsilon a numeric value. If mode is ADVANCE, epsilone value will be threshold for CCF difference to decide if they are in same or different clone.
pat.vs.genes <- function (sample.mutations, genes = NULL, valueCol = NULL, Hugo_Symbol=NULL, Tumor_Sample_Barcode=NULL, sample.gene.lim = NULL, mode='MAX', epsilon= 0.05 ){
if (is.atomic(sample.mutations)) {
sample.mutations <- data.frame(x = sample.mutations)
}
mode <- toupper(mode)
if ( !mode %in% c('SUM', 'MAX', 'ADVANCE')) {
stop("mode mast be or SUM or MAX or ADVACE! ", call. = FALSE)
}
if (!is.null(Hugo_Symbol)){
sample.mutations <- assign.columns(sample.mutations, Hugo_Symbol, "Hugo_Symbol")
}
if (!is.null(Tumor_Sample_Barcode)){
sample.mutations <- assign.columns(sample.mutations, Tumor_Sample_Barcode, "Tumor_Sample_Barcode")
}
if (!is.null(sample.gene.lim) & (length(sample.gene.lim) > 1 | !is.numeric(sample.gene.lim))) {
stop("sample.gene.lim must be a single numeric variable", call. = FALSE)
}
if (!is.null(valueCol) & (length(valueCol) > 1 )) {
stop("valueCol must be a single variable, charachter or numeric", call. = FALSE)
}
if( !is.null(valueCol) ){
if (is.character(valueCol) & (! valueCol %in% colnames(sample.mutations))) {
stop("valueCol must one of column names in sample.mutations", call. = FALSE)
}
if (is.numeric(valueCol) & (valueCol > ncol(sample.mutations))) {
stop("valueCol must be numeric value indicating valid column", call. = FALSE)
}
}
#if (is.null(valueCol) ){
# message('As valueCol is not specified only counting of mutations for sample-gene pairs will be done.')
#}
if (is.null(valueCol) & !is.null(sample.gene.lim) ){
message('With limit value for sample-gene pair, there need to be specified targeted column (valueCol). Limit value will be ignored.')
}
#if ( mode!='ADVANCE' & !is.null(epsilon) ){
# message('Epsilon value is only used with mode ADVANCE. Epsilon will be ignored.')
#}
##########
# make it not sensitive to lower/upper case in column names
original.col.names <- colnames(sample.mutations)
num.col <- ncol(sample.mutations)
colnames(sample.mutations) <- tolower(colnames(sample.mutations))
if (is.null(genes)){
genes <- as.character(unique(sample.mutations$hugo_symbol))
}
advance.ccf.calc <- function(x, epsilon){
x <- sort(x, decreasing=T)
sum.ccf <- x[1]
if (length(x) >1){
for (i in 2:length(x)) {
if ((x[i-1] - x[i]) > epsilon){
sum.ccf <- sum.ccf + x[i]
}
}
}
return(sum.ccf)
}
# solve NA values for valueCol, set to 1
sample.mutations2 <- sample.mutations
if (!is.null(valueCol)){
ind <- is.na(sample.mutations2[,valueCol])
if (sum(ind)) {
warning("There are ", sum(ind), " NA values in column ", valueCol, " which are replaced by 1! \n")
sample.mutations2[ind, valueCol] <- 1
}
}
#test <- acast(sample.mutations2, Hugo_Symbol ~ Tumor_Sample_Barcode, sum )
### creat data table, to be faster ###
sample.gene.dt <- data.table(sample.mutations2)
keycols <- c("hugo_symbol","tumor_sample_barcode")
setkeyv(sample.gene.dt,keycols)
if(!is.factor(sample.gene.dt$hugo_symbol)){
sample.gene.dt$hugo_symbol <- factor(sample.gene.dt$Hugo_Symbol, levels = (genes))
}
if(!is.factor(sample.gene.dt$tumor_sample_barcode)){
sample.gene.dt$tumor_sample_barcode <- factor(sample.gene.dt$tumor_sample_barcode, levels= sort(as.vector(unique(sample.gene.dt$tumor_sample_barcode))) )
}
if (mode == 'SUM' & (!is.null(valueCol) )){
# if we sum values of column valueCol, for same sample-gene pairs
if (is.null(sample.gene.lim)){
sample.gene.lim <- Inf
}
if (is.character(valueCol)){
f <- paste(valueCol, " ~ hugo_symbol + tumor_sample_barcode", collapse="")
sample.gene.dt <- aggregate(as.formula(f) , data=sample.gene.dt, function(x) min(sum(x),sample.gene.lim))
} else if (is.numeric(valueCol)){
name.col <- colnames(sample.mutations2)[as.integer(valueCol)]
f <- paste(name.col, " ~ hugo_symbol + tumor_sample_barcode", collapse="")
sample.gene.dt <- aggregate(as.formula(f) , data=sample.gene.dt, function(x) min(sum(x),sample.gene.lim))
} else {
stop('No valid sum column (should be CCF or Damage score - like Condel)')
}
} else if (mode == 'MAX' & (!is.null(valueCol) )) {
# if we take max value of column valueCol, for same sample-gene pairs
if (is.null(sample.gene.lim)){
sample.gene.lim <- Inf
}
if (is.character(valueCol)){
f <- paste(valueCol, " ~ hugo_symbol + tumor_sample_barcode", collapse="")
sample.gene.dt <- aggregate(as.formula(f) , data=sample.gene.dt, function(x) min(max(x, na.rm=T),sample.gene.lim))
} else if (is.numeric(valueCol)){
name.col <- colnames(sample.mutations2)[as.integer(valueCol)]
f <- paste(name.col, " ~ hugo_symbol + tumor_sample_barcode", collapse="")
sample.gene.dt <- aggregate(as.formula(f) , data=sample.gene.dt, function(x) min(max(x, na.rm=T),sample.gene.lim))
} else {
stop('No valid sum column (should be CCF or Damage score - like Condel)')
}
} else if (mode == 'ADVANCE' & (!is.null(valueCol) ) ){
# if we take max or sum value of column valueCol, for same sample-gene pairs, depending if in same or different clone
if (is.null(sample.gene.lim)){
sample.gene.lim <- Inf
}
#sample.gene.dt <- sample.gene.dt[order(sample.gene.dt$Hugo_Symbol, sample.gene.dt$Tumor_Sample_Barcode, -sample.gene.dt$CCF),]
if (is.character(valueCol)){
f <- paste(valueCol, " ~ hugo_symbol + tumor_sample_barcode", collapse="")
sample.gene.dt <- aggregate(as.formula(f) , data=sample.gene.dt, function(x) min(advance.ccf.calc(x, epsilon),sample.gene.lim))
} else if (is.numeric(valueCol)){
name.col <- colnames(sample.mutations2)[as.integer(valueCol)]
f <- paste(name.col, " ~ hugo_symbol + tumor_sample_barcode", collapse="")
sample.gene.dt <- aggregate(as.formula(f) , data=sample.gene.dt, function(x) min(advance.ccf.calc(x, epsilon),sample.gene.lim))
} else {
stop('No valid sum column (should be CCF or Damage score - like Condel)')
}
}
# creating matrix
if (is.null(valueCol)){
## just counts
mat<- xtabs(~ hugo_symbol + tumor_sample_barcode, data=sample.gene.dt)
} else {
if (is.character(valueCol)){
f <- paste(valueCol, " ~ hugo_symbol + tumor_sample_barcode", collapse="")
mat <- xtabs(as.formula(f), data=sample.gene.dt)
} else if (is.numeric(valueCol)){
name.col <- colnames(sample.mutations2)[as.integer(valueCol)]
f <- paste(name.col, " ~ hugo_symbol + tumor_sample_barcode", collapse="")
mat <- xtabs(as.formula(f), data=sample.gene.dt)
} else {
stop('No valid sum column (should be CCF or Damage score - like Condel)')
}
}
#return original names
colnames(sample.mutations)[1:num.col] <- original.col.names
class(mat) <- 'matrix'
return(mat)
}
#' Bayesian 'risk' inference model.
#' @description
#' \code{bayes.risk} function runs a Bayesian inference model based on cancer versus healthy population model. The priors are set by the user or can be the likelihood is calculated from the data of SNVs/InDels.
#' @param sample.mutations data frame in MAF like format with nonsilent and silent mutations.
#' Columns names/header in \code{sample.mutations} must be:
#' \itemize{
#' \item Variant_Classification : column specified in the MAF format, which distinguishes between silent and nonsilent SNVs
#' \item Hugo_Symbol : column specified in the MAF format, which reports the gene name for each SNV.
#' \item Tumor_Sample_Barcode : column specified in the MAF format, which reports in wich patient the SNV was found.
#' \item CCF : numeric column produce by the \code{CCF} function, or calculated previously for each SNV.
#' \item Damage_score numeric column with values between 0 and 1, where 1 means very damaging SNV/IndDel and 0 not damaging SNV/InDel
#' }
#' @param bcgr.prob A numeric vector. It must have the same length and order as the genes vector. It indicates the probability of a gene having a somatic mutation in the healthy population.
#' There are functions for obtaining this vector: \code{bcgr}, \code{bcgr.lawrence} and \code{bcgr.combine}.
#' @param prior.sick A numeric value representing the incidence of tumor in the population. Set by default to 0.0045.
#' @param genes (optional) vector of genes which were sequenced.
#' Vector of unique values of Hugo_Symbol names (with possibility of more additional genes which did not have any SNV in the cohort).
#' Default is NULL value and then list of unique genes is taken from \code{sample.mutations}.
#' @param Variant_Classification (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the classification for the SNVs/Indels (Silent or not).
#' Default is NULL value (in this case \code{sample.mutations} should already have this column).
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param Hugo_Symbol (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the gene names for the SNVs/Indels.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param Tumor_Sample_Barcode (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the sample ids for the SNVs/Indels.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param CCF (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the cancer cell fraction information for the SNVs/Indels.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param Damage_score (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the damage score for the SNVs/Indels.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' @param mode A character value indicating how to solve when in one gene-sample pair there are multiple mutations. Options are SUM, MAX and ADVANCE
#' @param epsilon A numeric value. If mode is ADVANCE, epsilone value will be the threshold for CCF difference to decide if they are in the same or in a different clone.
#' @return a data frame with ranked genes ordered the by posterior probability of the gene being a risk factor for developing tumor.
#' Additional columns with useful info are contained in the data frame.
#' @seealso \code{\link{bcgr}}, \code{\link{bcgr.lawrence}} and \code{\link{bcgr.combine}} for obtaining bcgr.prob variable.
#' @keywords risk
#' @examples
#' \donttest{
#' # first calculate CCF
#' sample.genes.mutect <- CCF(sample.genes.mutect)
#' # then somatic background probability
#' bcgr.prob <- bcgr.combine(sample.genes.mutect)
#' # bayes risk model
#' risk.genes <- bayes.risk(sample.genes.mutect, bcgr.prob, prior.sick = 0.00007)
#' head(risk.genes)
#' }
#' @export
bayes.risk <- function(sample.mutations, bcgr.prob, genes=NULL, prior.sick = 0.0045,
Variant_Classification=NULL, Hugo_Symbol=NULL, Tumor_Sample_Barcode=NULL, CCF=NULL, Damage_score=NULL , mode='MAX', epsilon=0.05 ) {
if (is.atomic(sample.mutations)) {
sample.mutations <- data.frame(x = sample.mutations)
}
mode <- toupper(mode)
if ( !mode %in% c('SUM', 'MAX', 'ADVANCE')) {
stop("mode mast be or SUM or MAX or ADVACE! ", call. = FALSE)
}
if (!is.null(Variant_Classification)){
sample.mutations <- assign.columns(sample.mutations, Variant_Classification, "Variant_Classification")
}
if (!is.null(Hugo_Symbol)){
sample.mutations <- assign.columns(sample.mutations, Hugo_Symbol, "Hugo_Symbol")
}
if (!is.null(Tumor_Sample_Barcode)){
sample.mutations <- assign.columns(sample.mutations, Tumor_Sample_Barcode, "Tumor_Sample_Barcode")
}
if (!is.null(CCF)){
sample.mutations <- assign.columns(sample.mutations, CCF, "CCF")
}
if (!is.null(Damage_score)){
sample.mutations <- assign.columns(sample.mutations, Damage_score, "Damage_score")
}
##########
# make it not sensitive to lower/upper case in column names
original.col.names <- colnames(sample.mutations)
num.col <- ncol(sample.mutations)
colnames(sample.mutations) <- tolower(colnames(sample.mutations))
if (is.null(genes)){
genes <- as.character(unique(sample.mutations$hugo_symbol))
}
if(!is.factor(sample.mutations$hugo_symbol)){
sample.mutations$hugo_symbol <- factor(sample.mutations$hugo_symbol, levels=genes)
}
if(!is.factor(sample.mutations$tumor_sample_barcode)){
sample.mutations$tumor_sample_barcode <- factor(sample.mutations$tumor_sample_barcode, levels=unique(sample.mutations$tumor_sample_barcode))
}
prior.healthy <- 1 - prior.sick
# take only exonic
sample.mutations <- exonic.only(sample.mutations)
# matrices genes vs. samples
mat.sample.gene.ns <- pat.vs.genes(sample.mutations[sample.mutations$variant_classification != 'Silent',], genes)
mat.sample.gene.s <- pat.vs.genes(sample.mutations[sample.mutations$variant_classification == 'Silent',], genes )
if ('damage_score' %in% colnames(sample.mutations)){
mat.sample.gene.ns.Damage <- pat.vs.genes(sample.mutations[sample.mutations$variant_classification != 'Silent',], genes, valueCol='damage_score', sample.gene.lim=1, mode=mode, epsilon)
} else {
mat.sample.gene.ns.Damage <- matrix(1, nrow=nrow(mat.sample.gene.ns), ncol=ncol(mat.sample.gene.ns))
}
mat.sample.gene.ns.ccf <- pat.vs.genes(sample.mutations[sample.mutations$variant_classification != 'Silent',], genes, valueCol='ccf', sample.gene.lim=1, mode=mode, epsilon)
mat.sample.gene.s.ccf <- pat.vs.genes(sample.mutations[sample.mutations$variant_classification == 'Silent',], genes, valueCol='ccf', sample.gene.lim=1, mode=mode, epsilon)
#condtitionals
mat.ns.somatic <- (mat.sample.gene.ns.Damage*mat.sample.gene.ns.ccf)
gene.mutated.if.sick <- rowSums(mat.ns.somatic)/length(unique(sample.mutations$tumor_sample_barcode))
gene.mutated.if.healthy <- bcgr.prob
bayes.risk <- (prior.sick * gene.mutated.if.sick[genes])/(prior.sick*gene.mutated.if.sick[genes] + prior.healthy*gene.mutated.if.healthy[genes])
#additional columns
observed.mut.ns <- base::apply(as.matrix(mat.sample.gene.ns),c(1),function(x) sum(x >0))
total.mut.ns <- rowSums(mat.sample.gene.ns)
observed.mut.s <- base::apply(as.matrix(mat.sample.gene.s),c(1),function(x) sum(x >0))
observed.mut.ns.ccf <- rowSums(mat.sample.gene.ns.ccf)
observed.mut.s.ccf <- rowSums(mat.sample.gene.s.ccf)
n <- rowSums(mat.ns.somatic)
n.indels <- table(sample.mutations[sample.mutations$variant_type %in% c('DEL', 'INS'),'hugo_symbol'], exclude=F)
top <- as.character(names(sort(bayes.risk, decreasing=T)))
sample.ns.mut <- paste('sample_ns_mut(',length(levels(sample.mutations$tumor_sample_barcode)),')',sep="")
sample.s.mut <- paste('sample_s_mut(',length(levels(sample.mutations$tumor_sample_barcode)),')',sep="")
final.df.bayes1 <- data.frame('Hugo_Symbol' = top,
'postProb' = bayes.risk[top],
'CCF_x_damage' = n[top],
'fold' = bayes.risk[top]/prior.sick,
'sample_ns_mut' = observed.mut.ns[top],
'total_ns_mut'= total.mut.ns[top],
'nonsilent_CCF_sum' = observed.mut.ns.ccf[top],
'sample_s_mut'= observed.mut.s[top],
'silent_CCF_sum' = observed.mut.s.ccf[top],
'indels'= as.numeric(n.indels[top]),
'background_mut_prob'=bcgr.prob[top],
'ccf_mean'= base::apply(as.matrix(mat.sample.gene.ns.ccf[top,]),1,function(x) mean(x[x!=0])),
'ccf_median'= base::apply(as.matrix(mat.sample.gene.ns.ccf[top,]),1,function(x) median(x[x!=0])),
'ccf_sd'= base::apply(as.matrix(mat.sample.gene.ns.ccf[top,]),1,function(x) sd(x[x!=0])),
'rank'=1:length(top)
)
colnames(final.df.bayes1)[5] <- sample.ns.mut
colnames(final.df.bayes1)[8] <- sample.s.mut
#return original names
colnames(sample.mutations)[1:num.col] <- original.col.names
return(final.df.bayes1)
}
#' Bayesian 'driver' inference model.
#' @description
#' \code{bayes.driver} function runs a Bayesian inference model based on a driver versus passenger model. Each observed mutation in the gene is taken as a proof for being a true driver.
#' The priors can be set or the function will take some default values.
#' @param sample.mutations data frame with SNVs and InDels in MAF like format.
#' Columns (with exactly same names) which \code{sample.mutations} should have are:
#' \itemize{
#' \item Variant_Classification column specifed by MAF format, used to distinguish between silent and nonsilent SNVs
#' \item Hugo_Symbol column specifed by MAF format, which reports gene for each SNV.
#' \item Tumor_Sample_Barcode column specifed by MAF format, reporting for each SNV in wich patient was found.
#' \item CCF numeric column produce by \code{CCF} function.
#' \item Damage_score numeric column with values between 0 and 1, where 1 means very damaging SNV/IndDel and 0 not damaging SNV/InDel
#' }
#' @param bcgr.prob A numeric vector. It must have the same length and order as the genes vector. It indicates the probability of a gene having a somatic mutation in the healthy population.
#' There are functions for obtaining this vector: \code{bcgr}, \code{bcgr.lawrence} and \code{bcgr.combine}.
#' @param prior.driver A numeric value representing the prior probability that a random gene is a driver.
#' Default is set to \code{length(driver.genes)}/20000, as the human genome has ~20000 protein coding genes.
#' @param gene.mut.driver A numeric value or named vector representing the likelihood that a gene is mutated if it is a known/published driver gene.
#' The gene does not need to be mutated if it is a driver, as cancers are heterogenous. Default is set to NULL and driver.genes are the genes considered as drivers.
#' @param driver.genes A character vector of genes which are considered as drivers for this cancer. If NULL then the used set is \code{driver.genes.concensus}.
#' @param genes (optional) vector of genes which were sequenced.
#' Vector of unique values of Hugo_Symbol names (with possibility of more additional genes which did not have any SNV in the cohort).
#' Default is NULL value and then list of unique genes is taken from \code{sample.mutations}.
#' @param Variant_Classification (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the classification for the SNVs/Indels (Silent or not).
#' Default is NULL value (in this case \code{sample.mutations} should already have this column).
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param Hugo_Symbol (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the gene names for the SNVs/Indels.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param Tumor_Sample_Barcode (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the sample ids for the SNVs/Indels.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param CCF (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the cancer cell fraction information for the SNVs/Indels.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' Column with this name should not already exist in \code{sample.mutations}.
#' @param Damage_score (optional) integer/numeric value indicating which column in \code{sample.mutations} contains the damage score for the SNVs/Indels.
#' Default is NULL value (in this case \code{sample.mutations} should already have this column)
#' @param mode a character value indicating how to solve when in one gene-sample pair there are multiple mutations. Options are SUM, MAX and ADVANCE
#' @param epsilon a numeric value. If mode is ADVANCE, epsilon value will be the threshold for the CCF difference to decide if they are in the same or a different clone.
#' @return a data frame with ranked genes ordered the by posterior probability of the gene being a mutational driver gene.
#' Additional columns with useful info are contained in the data frame.
#' @seealso \code{\link{bcgr}}, \code{\link{bcgr.lawrence}} and \code{\link{bcgr.combine}} for obtaining bcgr.prob variable.
#' @keywords risk
#' @examples
#' \donttest{
#' # first calculate CCF
#' sample.genes.mutect <- CCF(sample.genes.mutect)
#' # then somatic background probability
#' bcgr.prob <- bcgr.combine(sample.genes.mutect)
#' # bayes risk model
#' driver.genes <- bayes.driver(sample.genes.mutect, driver = 0.001, gene.mut.driver=1/50)
#' head(driver.genes)
#' }
#' @export
bayes.driver <- function(sample.mutations, bcgr.prob, genes=NULL, prior.driver = NULL, gene.mut.driver=NULL, driver.genes=NULL,
Variant_Classification=NULL, Hugo_Symbol=NULL, Tumor_Sample_Barcode=NULL, CCF=NULL, Damage_score=NULL , mode='MAX', epsilon=0.05 ) {
if (is.atomic(sample.mutations)) {
sample.mutations <- data.frame(x = sample.mutations)
}
mode <- toupper(mode)
if ( !mode %in% c('SUM', 'MAX', 'ADVANCE')) {
stop("mode mast be or SUM or MAX or ADVACE! ", call. = FALSE)
}
if (!is.null(Variant_Classification)){
sample.mutations <- assign.columns(sample.mutations, Variant_Classification, "Variant_Classification")
}
if (!is.null(Hugo_Symbol)){
sample.mut9ations <- assign.columns(sample.mutations, Hugo_Symbol, "Hugo_Symbol")
}
if (!is.null(Tumor_Sample_Barcode)){
sample.mutations <- assign.columns(sample.mutations, Tumor_Sample_Barcode, "Tumor_Sample_Barcode")
}
if (!is.null(CCF)){
sample.mutations <- assign.columns(sample.mutations, CCF, "CCF")
}
if (!is.null(Damage_score)){
sample.mutations <- assign.columns(sample.mutations, Damage_score, "Damage_score")
}
if(is.null(driver.genes) & is.null(gene.mut.driver)){
driver.genes <- cDriver::driver.genes.concensus
}
if (is.null(prior.driver)){
if(is.null(gene.mut.driver)){
prior.driver <- length(driver.genes)/20000
} else {
prior.driver <- length(gene.mut.driver)/20000
}
}
##########
# make it not sensitive to lower/upper case in column names
original.col.names <- colnames(sample.mutations)
num.col <- ncol(sample.mutations)
colnames(sample.mutations) <- tolower(colnames(sample.mutations))
if (is.null(genes)){
genes <- as.character(unique(sample.mutations$hugo_symbol))
}
if(!is.factor(sample.mutations$hugo_symbol)){
sample.mutations$hugo_symbol <- factor(sample.mutations$hugo_symbol, levels=genes)
}
if(!is.factor(sample.mutations$tumor_sample_barcode)){
sample.mutations$tumor_sample_barcode <- factor(sample.mutations$tumor_sample_barcode, levels=unique(sample.mutations$tumor_sample_barcode))
}
prior.passenger <- 1 - prior.driver
# take only exonic
sample.mutations <- exonic.only(sample.mutations)
#condtitionals
if(is.null(gene.mut.driver)){
num.mut <- nrow(sample.mutations[sample.mutations$hugo_symbol %in% driver.genes,])
num.pat <- length(unique(as.character(sample.mutations$tumor_sample_barcode)))
num.canc.genes <- length(unique(driver.genes))
gene.mut.if.driver <- c()
gene.mut.if.driver[genes] <- num.mut/(num.pat*num.canc.genes)
} else {
if(is.null(names(gene.mut.driver))){
gene.mut.if.driver <- c()
gene.mut.if.driver[genes] <- gene.mut.driver
} else {
gene.mut.if.driver[genes] <- gene.mut.driver[genes]
gene.mut.if.driver[is.na(gene.mut.if.driver)] <- 0
names(gene.mut.if.driver) <- genes
}
}
# make sure that probability that gene is mutated if a driver is bigger or equal then if passenger
#gene.mut.if.driver <- mapply(max, gene.mut.if.driver[genes], bcgr.prob[genes])
gene.mut.if.passenger <- bcgr.prob
gene.notmut.if.passenger <- 1 - gene.mut.if.passenger
gene.notmut.if.driver <- 1 - gene.mut.if.driver
# matrices genes vs. samples
mat.sample.gene.ns <- pat.vs.genes(sample.mutations[sample.mutations$variant_classification != 'Silent',], genes)
mat.sample.gene.s <- pat.vs.genes(sample.mutations[sample.mutations$variant_classification == 'Silent',], genes)
if ('damage_score' %in% colnames(sample.mutations) ){
mat.sample.gene.ns.Damage <- pat.vs.genes(sample.mutations[sample.mutations$variant_classification != 'Silent',], genes, valueCol='damage_score', sample.gene.lim=1, mode=mode, epsilon)
} else {
mat.sample.gene.ns.Damage <- matrix(1, nrow=nrow(mat.sample.gene.ns), ncol=ncol(mat.sample.gene.ns))
}
mat.sample.gene.ns.ccf <- pat.vs.genes(sample.mutations[sample.mutations$variant_classification != 'Silent',], genes, valueCol='ccf', sample.gene.lim=1, mode=mode, epsilon)
mat.sample.gene.s.ccf <- pat.vs.genes(sample.mutations[sample.mutations$variant_classification == 'Silent',], genes, valueCol='ccf', sample.gene.lim=1, mode=mode, epsilon)
# ccf corrected counts
n <- rowSums(mat.sample.gene.ns.Damage*mat.sample.gene.ns.ccf)
m <- ncol(mat.sample.gene.ns.Damage) - n
if (FALSE){
logxpy <- function(lx,ly) max(lx,ly) + log1p(exp(-abs(lx-ly)))
# A/A+B
lA <- log(prior.driver) + n[genes]*log(gene.mut.if.driver[genes] ) + m[genes]*log(gene.notmut.if.driver[genes])
#A <- log(prior.driver) + n*log(gene.mut.if.driver) + m*log(gene.notmut.if.driver)
#print(lA['TP53'])
lB <- log(prior.passenger) + n[genes]*log(gene.mut.if.passenger[genes]) + m[genes]*log(gene.notmut.if.passenger[genes])
#B <- log(prior.passenger) + n*log(gene.mut.if.passenger['TP53']) + m*log(gene.notmut.if.passenger['TP53'])
#print(lB['TP53'])
gene.driver.given.patients <- exp(lA -
(lB + log1p( (prior.driver/prior.passenger)*
((gene.mut.if.driver[genes]/gene.mut.if.passenger[genes])^n[genes])*
((gene.notmut.if.driver[genes]/gene.notmut.if.passenger[genes])^m[genes]) )))
#print(gene.driver.given.patients['TP53'])
}
#
#ind <- (gene.mut.if.driver[genes]^n[genes] == 0 ) | (gene.notmut.if.driver[genes]^m[genes] == 0) | (gene.mut.if.passenger[genes]^n[genes] == 0) | (gene.notmut.if.passenger[genes]^m[genes] == 0)
ind <- sapply(genes, function(x) ((gene.mut.if.driver[x]^n[x] == 0) | (gene.notmut.if.driver[x]^m[x] == 0) | (gene.mut.if.passenger[x]^n[x] == 0) | (gene.notmut.if.passenger[x]^m[x] == 0)))
if (sum(ind)){
genes1 <- genes[!ind]
genes2 <- genes[ind]
gene.driver.given.patients <- (prior.driver * gene.mut.if.driver[genes1]^n[genes1]*gene.notmut.if.driver[genes1]^m[genes1])/
( (prior.driver* gene.mut.if.driver[genes1]^n[genes1]*gene.notmut.if.driver[genes1]^m[genes1])+
(prior.passenger* gene.mut.if.passenger[genes1]^n[genes1]*gene.notmut.if.passenger[genes1]^m[genes1]) )
# only to be able to run as Rscript
requireNamespace('methods')
gene.driver.given.patients2 <- (prior.driver * mpfr(gene.mut.if.driver[genes2], precBits = 128)^n[genes2] * mpfr(gene.notmut.if.driver[genes2], precBits = 128)^m[genes2])/
( (prior.driver* mpfr(gene.mut.if.driver[genes2], precBits = 128)^n[genes2]*mpfr(gene.notmut.if.driver[genes2], precBits = 128)^m[genes2])+
(prior.passenger* mpfr(gene.mut.if.passenger[genes2], precBits = 128)^n[genes2]*mpfr(gene.notmut.if.passenger[genes2], precBits = 128)^m[genes2]) )
gene.driver.given.patients2 <- as.numeric(gene.driver.given.patients2)
names(gene.driver.given.patients2) <- genes2
gene.driver.given.patients <- c(gene.driver.given.patients, gene.driver.given.patients2)
gene.driver.given.patients <- gene.driver.given.patients[genes]
} else {
gene.driver.given.patients <- (prior.driver * gene.mut.if.driver[genes]^n[genes]*gene.notmut.if.driver[genes]^m[genes])/
( (prior.driver* gene.mut.if.driver[genes]^n[genes]*gene.notmut.if.driver[genes]^m[genes])+
(prior.passenger* gene.mut.if.passenger[genes]^n[genes]*gene.notmut.if.passenger[genes]^m[genes]) )
}
#additional columns
observed.mut.ns <- base::apply(mat.sample.gene.ns,1,function(x) sum(x >0))
total.mut.ns <- rowSums(mat.sample.gene.ns)
observed.mut.s <- base::apply(mat.sample.gene.s,1,function(x) sum(x >0))
observed.mut.ns.ccf <- rowSums(mat.sample.gene.ns.ccf)
observed.mut.s.ccf <- rowSums(mat.sample.gene.s.ccf)
n.indels <- table(sample.mutations[sample.mutations$variant_type %in% c('DEL', 'INS'),'hugo_symbol'], exclude=F)
df.temp <- as.data.frame(cbind(gene.driver.given.patients[genes],n[genes]))
df.temp <- df.temp[ order(round(df.temp$V1, digits=4), df.temp$V2, decreasing=T), ]
#top <- as.character(names(sort(gene.driver.given.patients, decreasing=T)))
top <- as.character(rownames(df.temp))
sample.ns.mut <- paste('sample_ns_mut(',length(levels(sample.mutations$tumor_sample_barcode)),')',sep="")
sample.s.mut <- paste('sample_s_mut(',length(levels(sample.mutations$tumor_sample_barcode)),')',sep="")
final.df.bayes2 <- data.frame('Hugo_Symbol' = top,
'postProb' = round(gene.driver.given.patients[top], digits=4),
'CCF_x_damage' = n[top],
'fold' = gene.driver.given.patients[top]/prior.driver,
'sample_ns_mut' = observed.mut.ns[top],
'total_ns_mut'= total.mut.ns[top],
'nonsilent_CCF_sum' = observed.mut.ns.ccf[top],
'silent'= observed.mut.s[top],
'silent_CCF_sum' = observed.mut.s.ccf[top],
'indels'= as.numeric(n.indels[top]),
'background_mut_prob'=bcgr.prob[top],
'ccf_mean'= base::apply(mat.sample.gene.ns.ccf[top,],1,function(x) mean(x[x!=0])),
'ccf_median'= base::apply(mat.sample.gene.ns.ccf[top,],1,function(x) median(x[x!=0])),
'ccf_sd'= base::apply(mat.sample.gene.ns.ccf[top,],1,function(x) sd(x[x!=0])),
'gene_mut_if_driver'=gene.mut.if.driver[top],
'rank'=1:length(top)
)
colnames(final.df.bayes2)[5] <- sample.ns.mut
colnames(final.df.bayes2)[8] <- sample.s.mut
#return original names
colnames(sample.mutations)[1:num.col] <- original.col.names
# correct to NA postProb for genes without any nonsilent mutation
#final.df.bayes2[final.df.bayes2$total_ns_mut == 0, 'postProb'] <- NA
#final.df.bayes2 <- final.df.bayes2[order(final.df.bayes2$postProb, decreasing=T, na.last=T), ]
return(final.df.bayes2)
}
#' Combining mulitple methods for ranking genes.
#' @description
#' \code{combine.ranking} function combines multiple rankings and gives a final rank to each gene.
#' @param rankings A list containing at least two elements. The first two elements should be the two data frames produced by the bayes.risk and the bayes.driver functions.
#' All the next elements of the list, must be named numeric vectors, where the names are the Hugo_Symbol and the value is the rank given to the each gene.
#' @param genes (optional) vector of genes which were sequenced.
#' Vector of unique values of Hugo_Symbol names (with possibility of more additional genes which did not have any SNV in the cohort).
#' Default is NULL value and then list of unique genes is taken from \code{sample.mutations}.
#' @param min.mut a numeric value, threshold for filtering genes based on a minimum number of nonsilent mutations.
#' @return a data frame with ranked genes ordered the by the combination of two or more list of ranked genes.
#' Additional columns with useful info are contained in the data frame.
#' @seealso \code{\link{bayes.risk}}, \code{\link{bayes.driver}} for obtaining bayes.risk.rank and bayes.driver.rank variables.
#' @keywords combine
# @examples
#' @export
combine.ranking <- function(rankings=list(bayes.risk.rank, bayes.driver.rank), genes = NULL, min.mut = 2 ){
bayes.risk.rank <- rankings[[1]]
bayes.driver.rank <- rankings[[2]]
if (is.null(genes)){
if(!'Hugo_Symbol' %in% colnames(bayes.driver.rank)){
stop(paste("If genes not provided, then there must be column Hugo_Symbol specified in first dataframe in list rankings "), call. = FALSE)
}
genes <- as.character(unique(bayes.driver.rank$Hugo_Symbol))
}
rank.df <- data.frame('Hugo_Symbol' = genes,
'bayes_risk' = (bayes.risk.rank[genes,]$postProb),
'bayes_driver' = (bayes.driver.rank[genes,]$postProb),
'risk_rank' = (bayes.risk.rank[genes,]$rank),
'driver_rank' = (bayes.driver.rank[genes,]$rank),
'sample_ns_mut' = bayes.driver.rank[genes,5],
'total_ns_mut'= bayes.driver.rank[genes,]$total_ns_mut,
'ccf_sum' = bayes.driver.rank[genes,]$nonsilent_CCF_sum ,
'CCFxDamage'= bayes.driver.rank[genes,]$CCF_x_damage,
'indels_count'= bayes.driver.rank[genes,]$indels ,
'mut_rate' = bayes.driver.rank[genes,]$background_mut_prob
)
colnames(rank.df)[6] <- colnames(bayes.driver.rank)[5]
if (length(rankings) > 2) {
n <- length(rankings)
for (i in 3:n){
rank.df[,paste('score', i, sep='', collapse='')] <- rankings[[i]][genes]
}
}
suppressWarnings(
rank.df <- rank.df[rank.df$sample_ns_mut >= min.mut & !is.na(rank.df$sample_ns_mut), ]
)
rank.df$rank_bayes_risk <- rank( ( rank.df$risk_rank), ties.method="min")
rank.df$rank_bayes_driver <- rank( ( rank.df$driver_rank), ties.method="min")
if (length(rankings) > 2) {
n <- length(rankings)
for (i in 3:n){
rank.df[,paste('rank', i, sep='', collapse='')] <- rank( (- rank.df[,paste('score', i, sep='', collapse='')] ), ties.method="min")
}
}
#rank.df <- rank.df[with(rank.df, order(rank.bayes.driver, decreasing=F)),]
if (length(rankings) <= 2) {
prc.rank <- prcomp(~rank_bayes_risk+rank_bayes_driver,rank.df, center=F)
} else {
n <- length(rankings)
formula<- paste('~rank_bayes_risk+rank_bayes_driver', paste('rank',3:n, sep='', collapse='+'), sep='+', collapse='')
prc.rank <- prcomp(as.formula(formula),rank.df, center=F)
}
ind <- prc.rank$rotation[1,1] < 0
genes.order <- names(sort(prc.rank$x[,1], decreasing=ind))
rank.df <- rank.df[genes.order, ]
rank.df$comb_rank <- rank( abs(sort((prc.rank$x[,1]), decreasing=ind)), ties.method="min")
return(rank.df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.