Nothing
#' Detect cancer driver genes based on positional clustering of variants.
#'
#' @description Clusters variants based on their position to detect disease causing genes.
#' @details This is the re-implimentation of algorithm defined in OncodriveCLUST article. Concept is based on the fact that most of the variants in cancer causing genes are enriched at few specific loci (aka hotspots).
#' This method takes advantage of such positions to identify cancer genes. Cluster score of 1 means, a single hotspot hosts all observed variants. If you use this function, please cite OncodriveCLUST article.
#' @references Tamborero D, Gonzalez-Perez A and Lopez-Bigas N. OncodriveCLUST: exploiting the positional clustering of somatic mutations to identify cancer genes. Bioinformatics. 2013; doi: 10.1093/bioinformatics/btt395s
#' @param maf an \code{\link{MAF}} object generated by \code{\link{read.maf}}
#' @param AACol manually specify column name for amino acid changes. Default looks for field 'AAChange'
#' @param pvalMethod either zscore (default method for oncodriveCLUST), poisson or combined (uses lowest of the two pvalues).
#' @param nBgGenes minimum number of genes required to estimate background score. Default 100. Do not change this unless its necessary.
#' @param minMut minimum number of mutations required for a gene to be included in analysis. Default 5.
#' @param bgEstimate If FALSE skips background estimation from synonymous variants and uses predifined values estimated from COSMIC synonymous variants.
#' @param ignoreGenes Ignore these genes from analysis. Default NULL. Helpful in case data contains large number of variants belonging to polymorphic genes such as mucins and TTN.
#' @return data table of genes ordered according to p-values.
#' @seealso \code{\link{plotOncodrive}}
#' @examples
#'
#' laml.maf <- system.file("extdata", "tcga_laml.maf.gz", package = "maftools")
#' laml <- read.maf(maf = laml.maf)
#' laml.sig <- oncodrive(maf = laml, AACol = 'Protein_Change', minMut = 5)
#'
#' @export
oncodrive = function(maf, AACol = NULL, minMut = 5, pvalMethod = 'zscore', nBgGenes = 100, bgEstimate = TRUE, ignoreGenes = NULL){
warning("Oncodrive has been superseeded by OncodriveCLUSTL. See http://bg.upf.edu/group/projects/oncodrive-clust.php")
#Proetin Length source
gl = system.file('extdata', 'prot_len.txt.gz', package = 'maftools')
if(Sys.info()[['sysname']] == 'Windows'){
gl.gz = gzfile(description = gl, open = 'r')
gl <- suppressWarnings( data.table(read.csv( file = gl.gz, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) )
close(gl.gz)
} else{
gl = data.table::fread(cmd = paste('zcat <', gl), sep = '\t', stringsAsFactors = FALSE)
}
pval.options = c('zscore', 'poisson', 'combined')
if(!pvalMethod %in% pval.options){
stop('pvalMethod can only be either zscore, poisson or combined')
}
if(length(pvalMethod) > 1){
stop('pvalMethod can only be either zscore, poisson or combined')
}
#syn variants for background
syn.maf = maf@maf.silent
#number of samples in maf
numSamples = as.numeric(maf@summary[3,summary])
#Perform clustering and calculate background scores.
if(bgEstimate){
if(nrow(syn.maf) == 0){
message('No syn mutations found! Skipping background estimation. Using predefined values. (Mean = 0.279; SD = 0.13)')
bg.mean = 0.279
bg.sd = 0.13
}else{
message('Estimating background scores from synonymous variants..')
syn.bg.scores = parse_prot(dat = syn.maf, AACol = AACol, gl, m = minMut, calBg = TRUE, nBg = nBgGenes)
#If number of genes to calculate background scores is not enough, use predefined scores.
if(is.null(syn.bg.scores)){
message("Not enough genes to build background. Using predefined values. (Mean = 0.279; SD = 0.13)")
bg.mean = 0.279
bg.sd = 0.13
}else {
if(nrow(syn.bg.scores) < nBgGenes){
message("Not enough genes to build background. Using predefined values. (Mean = 0.279; SD = 0.13)")
bg.mean = 0.279
bg.sd = 0.13
}else{
bg.mean = mean(syn.bg.scores$clusterScores)
bg.sd = sd(syn.bg.scores$clusterScores)
message(paste('Estimated background mean: ', bg.mean))
message(paste('Estimated background SD: ', bg.sd))
}
}
}
}else{
message("Using predefined values for background. (Mean = 0.279; SD = 0.13)")
bg.mean = 0.279
bg.sd = 0.13
}
#non-syn variants
non.syn.maf = maf@data
#Remove genes to ignore
if(!is.null(ignoreGenes)){
ignoreGenes.count = nrow(non.syn.maf[Hugo_Symbol %in% ignoreGenes])
message(paste('Removed', ignoreGenes.count, 'variants belonging to', paste(ignoreGenes, collapse = ', ', sep=',')))
non.syn.maf = non.syn.maf[!Hugo_Symbol %in% ignoreGenes]
}
#Perform clustering and calculate cluster scores for nonsyn variants.
message('Estimating cluster scores from non-syn variants..')
nonsyn.scores = parse_prot(dat = non.syn.maf, AACol = AACol, gl = gl, m = minMut, calBg = FALSE, nBg = nBgGenes)
if(pvalMethod == 'combined'){
message('Comapring with background model and estimating p-values..')
nonsyn.scores$zscore = (nonsyn.scores$clusterScores - bg.mean) / bg.sd
nonsyn.scores$tPval = 1- pnorm(nonsyn.scores$zscore)
nonsyn.scores$tFdr = p.adjust(nonsyn.scores$tPval, method = 'fdr')
nonsyn.scores = merge(getGeneSummary(maf), nonsyn.scores, by = 'Hugo_Symbol')
nonsyn.scores[,fract_muts_in_clusters := muts_in_clusters/total]
counts.glm = glm(formula = total ~ protLen+clusters, family = poisson(link = identity), data = nonsyn.scores) #Poisson model
nonsyn.scores$Expected = counts.glm$fitted.values #Get expected number of events (mutations) from the model
observed_mut_colIndex = which(colnames(nonsyn.scores) == 'total')
expected_mut_colIndex = which(colnames(nonsyn.scores) == 'Expected')
#Poisson test to caluclate difference (p-value)
nonsyn.scores$poissonPval = apply(nonsyn.scores, 1, function(x) {
poisson.test(as.numeric(x[observed_mut_colIndex]), as.numeric(x[expected_mut_colIndex]))$p.value
})
nonsyn.scores$poissonFdr = p.adjust(nonsyn.scores$poissonPval, method = 'fdr')
nonsyn.scores = nonsyn.scores[order(poissonFdr)]
nonsyn.scores$fdr = apply(nonsyn.scores[,.(tFdr, poissonFdr)], MARGIN = 1, FUN = min)
} else if(pvalMethod == 'zscore'){
#Oncodrive clust way of caluclating pvalues
#Calculate z scores; compare it to bg scores and estimate z-score, pvalues, corrected pvalues (fdr) (assumes normal distribution)
message('Comapring with background model and estimating p-values..')
nonsyn.scores$zscore = (nonsyn.scores$clusterScores - bg.mean) / bg.sd
nonsyn.scores$pval = 1- pnorm(nonsyn.scores$zscore)
nonsyn.scores$fdr = p.adjust(nonsyn.scores$pval, method = 'fdr')
nonsyn.scores = merge(getGeneSummary(maf), nonsyn.scores, by = 'Hugo_Symbol')
nonsyn.scores[,fract_muts_in_clusters := muts_in_clusters/total]
#nonsyn.scores[,fract_MutatedSamples := MutatedSamples/numSamples]
nonsyn.scores = nonsyn.scores[order(fdr)]
}else{
#Assuming poisson distribution of mutation counts
#Now model observed number of mutations as a function of number of clusters and protein length. Calculate expected number of events based on poisson distribution.
nonsyn.scores = merge(getGeneSummary(maf), nonsyn.scores, by = 'Hugo_Symbol')
nonsyn.scores[,fract_muts_in_clusters := muts_in_clusters/total]
counts.glm = glm(formula = total ~ protLen+clusters, family = poisson(link = identity), data = nonsyn.scores) #Poisson model
nonsyn.scores$Expected = counts.glm$fitted.values #Get expected number of events (mutations) from the model
observed_mut_colIndex = which(colnames(nonsyn.scores) == 'total')
expected_mut_colIndex = which(colnames(nonsyn.scores) == 'Expected')
#Poisson test to caluclate difference (p-value)
nonsyn.scores$pval = apply(nonsyn.scores, 1, function(x) {
poisson.test(as.numeric(x[observed_mut_colIndex]), as.numeric(x[expected_mut_colIndex]))$p.value
})
nonsyn.scores$fdr = p.adjust(nonsyn.scores$pval, method = 'fdr')
nonsyn.scores = nonsyn.scores[order(fdr)]
}
message('Done !')
return(nonsyn.scores)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.