Nothing
#' Read MAF files.
#' @description Takes tab delimited MAF (can be plain text or gz compressed) file as an input and summarizes it in various ways. Also creates oncomatrix - helpful for visualization.
#' @details This function takes MAF file as input and summarizes them. If copy number data is available, e.g from GISTIC, it can be provided too via arguments gisticAllLesionsFile, gisticAmpGenesFile,
#' and gisticDelGenesFile. Copy number data can also be provided as a custom table containing Gene name, Sample name and Copy Number status.
#'
#' Note that if input MAF file contains multiple affected transcripts of a variant, this function by default removes them as duplicates, while keeping single unique entry per variant per sample.
#' If you wish to keep all of them, set removeDuplicatedVariants to FALSE.
#'
#'FLAGS - If you get a note on possible FLAGS while reading MAF, it means some of the top mutated genes are fishy. These genes are often non-pathogenic and passengers, but are frequently mutated in most of the public exome studies. Examples of such genes include TTN, MUC16, etc.
#'This note can be ignored without any harm, it's only generated as to make user aware of such genes. See references for details on FLAGS.
#'
#'@references Shyr C, Tarailo-Graovac M, Gottlieb M, Lee JJ, van Karnebeek C, Wasserman WW. FLAGS, frequently mutated genes in public exomes. BMC Med Genomics 2014; 7: 64.
#'
#' @param maf tab delimited MAF file. File can also be gz compressed. Required. Alternatively, you can also provide already read MAF file as a dataframe.
#' @param clinicalData Clinical data associated with each sample/Tumor_Sample_Barcode in MAF. Could be a text file or a data.frame. Default NULL.
#' @param useAll logical. Whether to use all variants irrespective of values in Mutation_Status. Defaults to TRUE. If FALSE, only uses with values Somatic.
#' @param gisticAllLesionsFile All Lesions file generated by gistic. e.g; all_lesions.conf_XX.txt, where XX is the confidence level. Default NULL.
#' @param gisticAmpGenesFile Amplification Genes file generated by gistic. e.g; amp_genes.conf_XX.txt, where XX is the confidence level. Default NULL.
#' @param gisticDelGenesFile Deletion Genes file generated by gistic. e.g; del_genes.conf_XX.txt, where XX is the confidence level. Default NULL.
#' @param gisticScoresFile scores.gistic file generated by gistic. Default NULL
#' @param cnLevel level of CN changes to use. Can be 'all', 'deep' or 'shallow'. Default uses all i.e, genes with both 'shallow' or 'deep' CN changes
#' @param cnTable Custom copynumber data if gistic results are not available. Input file or a data.frame should contain three columns in aforementioned order with gene name, Sample name and copy number status (either 'Amp' or 'Del'). Default NULL.
#' @param isTCGA Is input MAF file from TCGA source. If TRUE uses only first 12 characters from Tumor_Sample_Barcode.
#' @param removeDuplicatedVariants removes repeated variants in a particuar sample, mapped to multiple transcripts of same Gene. See Description. Default TRUE.
#' @param vc_nonSyn NULL. Provide manual list of variant classifications to be considered as non-synonymous. Rest will be considered as silent variants. Default uses Variant Classifications with High/Moderate variant consequences. http://asia.ensembl.org/Help/Glossary?id=535: "Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site", "Translation_Start_Site","Nonsense_Mutation", "Nonstop_Mutation", "In_Frame_Del","In_Frame_Ins", "Missense_Mutation"
#' @param verbose TRUE logical. Default to be talkative and prints summary.
#' @return An object of class MAF.
#' @examples
#' laml.maf <- system.file("extdata", "tcga_laml.maf.gz", package = "maftools")
#' laml <- read.maf(maf = laml.maf)
#' @import data.table
#' @seealso \code{\link{plotmafSummary}} \code{\link{write.mafSummary}}
#' @export
read.maf = function(maf, clinicalData = NULL, removeDuplicatedVariants = TRUE, useAll = TRUE, gisticAllLesionsFile = NULL, gisticAmpGenesFile = NULL,
gisticDelGenesFile = NULL, gisticScoresFile = NULL, cnLevel = 'all', cnTable = NULL, isTCGA = FALSE, vc_nonSyn = NULL, verbose = TRUE){
#1. Read MAF if its a file or convert to data.table if its data.frame
start_time = proc.time()
if (is.data.frame(x = maf)) {
maf = data.table::as.data.table(maf)
} else{
if (verbose) {
cat('-Reading\n')
}
maf <-
data.table::fread(
file = maf,
sep = "\t",
stringsAsFactors = FALSE,
verbose = FALSE,
data.table = TRUE,
showProgress = TRUE,
header = TRUE,
fill = TRUE,
skip = "Hugo_Symbol",
quote = ""
)
# if(as.logical(length(grep(pattern = 'gz$', x = maf, fixed = FALSE)))){
# #If system is Linux use fread, else use gz connection to read gz file.
# if(Sys.info()[['sysname']] == 'Windows'){
# maf.gz = gzfile(description = maf, open = 'r')
# suppressWarnings(maf <- data.table::as.data.table(read.csv(file = maf.gz, header = TRUE, sep = '\t', stringsAsFactors = FALSE, comment.char = "#")))
# close(maf.gz)
# } else{
# maf = suppressWarnings(data.table::fread(cmd = paste('zcat <', maf), sep = '\t', stringsAsFactors = FALSE, verbose = FALSE, data.table = TRUE, showProgress = TRUE, header = TRUE, fill = TRUE, skip = "Hugo_Symbol", quote = ""))
# }
# } else{
# suppressWarnings(maf <- data.table::fread(input = maf, sep = "\t", stringsAsFactors = FALSE, verbose = FALSE, data.table = TRUE, showProgress = TRUE, header = TRUE, fill = TRUE, skip = "Hugo_Symbol", quote = ""))
# }
}
#2. validate MAF file
if(verbose){
cat("-Validating\n")
}
maf = validateMaf(maf = maf, isTCGA = isTCGA, rdup = removeDuplicatedVariants, chatty = verbose)
#3. validation check for variants classified as Somatic in Mutation_Status field.
if(!useAll){
cat('--Using only `Somatic` variants from Mutation_Status. Set useAll = TRUE to include everything.')
if(length(colnames(maf)[colnames(x = maf) %in% 'Mutation_Status']) > 0){
maf = maf[Mutation_Status %in% "Somatic"]
if(nrow(maf) == 0){
stop('No more Somatic mutations left after filtering for Mutation_Status! Maybe set useAll to TRUE ?')
}
}else{
cat('Mutation_Status not found. Assuming all variants are Somatic and validated\n')
}
}
#4. Seperate synonymous variants from non-syn variants
#Variant Classification with Low/Modifier variant consequences. http://asia.ensembl.org/Help/Glossary?id=535
if(is.null(vc_nonSyn)){
vc.nonSilent = c("Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site", "Translation_Start_Site",
"Nonsense_Mutation", "Nonstop_Mutation", "In_Frame_Del",
"In_Frame_Ins", "Missense_Mutation")
}else{
vc.nonSilent = vc_nonSyn
}
# silent = c("3'UTR", "5'UTR", "3'Flank", "Targeted_Region", "Silent", "Intron",
# "RNA", "IGR", "Splice_Region", "5'Flank", "lincRNA", "De_novo_Start_InFrame", "De_novo_Start_OutOfFrame", "Start_Codon_Ins", "Start_Codon_SNP", "Stop_Codon_Del")
#Variant Classification with High/Moderate variant consequences. http://asia.ensembl.org/Help/Glossary?id=535
maf.silent = maf[!Variant_Classification %in% vc.nonSilent] #Silent variants
if(nrow(maf.silent) > 0){
maf.silent.vc = maf.silent[,.N, .(Tumor_Sample_Barcode, Variant_Classification)]
maf.silent.vc.cast = data.table::dcast(data = maf.silent.vc, formula = Tumor_Sample_Barcode ~ Variant_Classification, fill = 0, value.var = 'N') #why dcast is not returning it as data.table ?
summary.silent = data.table::data.table(ID = c('Samples',colnames(maf.silent.vc.cast)[2:ncol(maf.silent.vc.cast)]),
N = c(nrow(maf.silent.vc.cast), colSums(maf.silent.vc.cast[,2:ncol(maf.silent.vc.cast), with = FALSE])))
maf = maf[Variant_Classification %in% vc.nonSilent] #Choose only non-silent variants from main table
if(verbose){
cat(paste0('-Silent variants: ', nrow(maf.silent)), '\n')
#print(summary.silent)
}
}
if(nrow(maf) == 0){
stop("No non-synonymous mutations found\nCheck `vc_nonSyn`` argumet in `read.maf` for details")
}
#5. Process CN data if available.
if(!is.null(gisticAllLesionsFile)){
gisticIp = readGistic(gisticAllLesionsFile = gisticAllLesionsFile, gisticAmpGenesFile = gisticAmpGenesFile,
gisticDelGenesFile = gisticDelGenesFile, isTCGA = isTCGA, gisticScoresFile = gisticScoresFile, cnLevel = cnLevel, verbose = verbose)
gisticIp = gisticIp@data
suppressWarnings(gisticIp[, id := paste(Hugo_Symbol, Tumor_Sample_Barcode, sep=':')])
gisticIp = gisticIp[!duplicated(id)]
gisticIp[,id := NULL]
maf = data.table::rbindlist(list(maf, gisticIp), fill = TRUE, use.names = TRUE)
maf$Tumor_Sample_barcode = factor(x = maf$Tumor_Sample_barcode,
levels = unique(c(levels(maf$Tumor_Sample_barcode), unique(as.character(gisticIp$Tumor_Sample_barcode)))))
#oncomat = createOncoMatrix(maf, chatty = verbose)
}else if(!is.null(cnTable)){
if(verbose){
cat('-Processing copy number data\n')
}
if(is.data.frame(cnTable)){
cnDat = data.table::copy(cnTable)
data.table::setDT(x = cnDat)
}else{
cnDat = data.table::fread(input = cnTable, sep = '\t', stringsAsFactors = FALSE, header = TRUE, colClasses = 'character')
}
colnames(cnDat) = c('Hugo_Symbol', 'Tumor_Sample_Barcode', 'Variant_Classification')
if(isTCGA){
cnDat[,Tumor_Sample_Barcode := substr(x = cnDat$Tumor_Sample_Barcode, start = 1, stop = 12)]
}
cnDat$Variant_Type = 'CNV'
suppressWarnings(cnDat[, id := paste(Hugo_Symbol, Tumor_Sample_Barcode, sep=':')])
cnDat = cnDat[!duplicated(id)]
cnDat[,id := NULL]
maf = data.table::rbindlist(l = list(maf, cnDat), fill = TRUE, use.names = TRUE)
maf$Tumor_Sample_barcode = factor(x = maf$Tumor_Sample_barcode,
levels = unique(c(levels(maf$Tumor_Sample_barcode), unique(as.character(cnDat$Tumor_Sample_barcode)))))
}
#Set factors
maf$Variant_Classification = as.factor(as.character(maf$Variant_Classification))
maf$Variant_Type = as.factor(as.character(maf$Variant_Type))
if(verbose){
cat('-Summarizing\n')
}
mafSummary = summarizeMaf(maf = maf, anno = clinicalData, chatty = verbose)
#7. Create MAF object
m = MAF(data = maf, variants.per.sample = mafSummary$variants.per.sample, variant.type.summary = mafSummary$variant.type.summary,
variant.classification.summary = mafSummary$variant.classification.summary, gene.summary = mafSummary$gene.summary,
summary = mafSummary$summary, maf.silent = maf.silent, clinical.data = mafSummary$sample.anno)
#m = mafSetKeys(maf = m)
if(verbose){
cat("-Finished in",data.table::timetaken(start_time),"\n")
}
return(m)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.