VariationTable <- R6::R6Class("VariationTable",
public = list(
data = NULL,
summary = NULL,
summary_by_gene = NULL,
phenotype = NULL,
reference=NULL,
initialize = function(data = NULL,
summary = NULL,
phenotype = NULL,
reference=NULL) {
self$data = data
},
print = function(...) {
cat("A VariationTable object: \n")
},
get_gene_summary = function() {
}
)
)
#' Read summarized variation data as VariationTable object
#'
#' Read tab delimited MAF, CNV table and other variation files (can be plain text or gz compressed),
#' or [data.frame]s. Input data will be summarized in various ways.
#'
#' @param Maf tab delimited MAF file. File can also be gz compressed.
#' Alternatively, you can also provide already read MAF file as a [data.frame].
#' @param cnTable tab delimited file or [data.frame] representing CNV. It has 5 necessary columns:
#' Chromosome (either chr# or # format, supports chr(1-22,X,Y)), Start, End,
#' Copynumber (absolute copy number value) and Sample.
#' @param Phenotype tab delimited file or [data.frame] representing phenotype data.
#' 'Tumor_Sample_Barcode' or 'Sample' can be used as sample ID.
#' The 'Tumor_Sample_Barcode' will be first priority if both column names exist.
#' @param refSNV genome build version, should be 'hg19' or 'hg38'. If `NULL`, it will be
#' obtained from `Maf`, 37 in `NCBI_Build` will set 'hg19' and 38 in `NCBI_Build` will set 'hg38'.
#' @param refINDEL same as above.
#' @param refCNV genome build version, should be 'hg19' or 'hg38'. Default is 'hg19'.
#' @param nonSyn Provide manual list of variant classifications to be considered as non-synonymous.
#' Rest will be considered as silent variants.
#' Default uses Variant Classifications with High/Moderate variant consequences.
#' <http://asia.ensembl.org/Help/Glossary?id=535>: "Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site",
#' "Translation_Start_Site","Nonsense_Mutation", "Nonstop_Mutation", "In_Frame_Del",
#' "In_Frame_Ins", "Missense_Mutation"
#' @param segnum_min minimal number of copy number segments within a sample.
#' @param segnum_max maximal number of copy number segments within a sample.
#' @param use_whole_genome if `TRUE`, all genome regions for copy number data will be used.
#' Regions with no copy number value will be set to normal copy 2.
#' Otherwise, only regions in input data are used.
#' @param keep_level Keep level for input data columns, you can use 'all' to
#' keep all columns and use 'custom' to add columns except for required columns.
#' @param keep_extra_cols extra columns to keep when `keep_level` set to 'custom'.
#' @param verbose print extra messages if `TRUE`.
#' @return a `VariationTable` object
#' @export
#'
#' @examples
#' library(maftools)
#' laml.Maf <- system.file("extdata", "tcga_laml.Maf.gz", package = "maftools")
#' laml <- VT(Maf = laml.Maf)
VT = function(Maf=NULL, cnTable=NULL, Phenotype=NULL,
refSNV=NULL, refINDEL=refSNV, refCNV=c("hg19", "hg38"),
nonSyn = c("Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site",
"Translation_Start_Site","Nonsense_Mutation", "Nonstop_Mutation",
"In_Frame_Del","In_Frame_Ins", "Missense_Mutation"),
segnum_min=0, segnum_max=Inf,
use_whole_genome=FALSE,
keep_level = c('basic', 'all', 'custom'),
keep_extra_cols = NULL,
verbose=TRUE, ...) {
stopifnot(any(!is.null(Maf), !is.null(cnTable)))
refCNV = match.arg(refCNV)
reference = dplyr::tibble(SNV=ifelse(is.null(refSNV), NA_character_, refSNV),
INDEL=ifelse(is.null(refINDEL), NA_character_, refINDEL),
CNV=refCNV)
if (!is.null(Maf)) {
if (verbose) {
message("=> Reading mutation data...")
message("==> Reading...")
}
if(!is.data.frame(x = Maf)){
if(as.logical(length(grep(pattern = 'gz$', x = Maf, fixed = FALSE)))){
#If system is Linux use fread, else use gz connection to read gz file.
if(Sys.info()[['sysname']] == 'Windows'){
Maf.gz = gzfile(description = Maf, open = 'r')
suppressWarnings(Maf <- data.table::as.data.table(read.csv(file = Maf.gz, header = TRUE, sep = '\t', stringsAsFactors = FALSE, comment.char = "#")))
close(Maf.gz)
} else{
Maf = suppressWarnings(data.table::fread(cmd = paste('zcat <', Maf), sep = '\t', stringsAsFactors = FALSE, verbose = FALSE, data.table = TRUE, showProgress = TRUE, header = TRUE, fill = TRUE, skip = "Hugo_Symbol"))
}
} else{
suppressWarnings(Maf <- data.table::fread(input = Maf, sep = "\t", stringsAsFactors = FALSE, verbose = FALSE, data.table = TRUE, showProgress = TRUE, header = TRUE, fill = TRUE, skip = "Hugo_Symbol"))
}
}
if (verbose) message("==> Checking reference genome build version...")
if (is.null(refSNV)) {
if (Maf$NCBI_Build[1] == 38 | Maf$NCBI_Build[1] == "38") {
reference$SNV <- reference$INDEL <- 'hg38'
} else if (Maf$NCBI_Build[1] == 37 & Maf$NCBI_Build[1] == '37') {
reference$SNV <- reference$INDEL <- 'hg37'
} else {
stop("Only 37 or 38 is supported in Maf file, report to https://github.com/ShixiangWang/sigtools/issues for supporting new type.")
}
}
if (verbose) message("==> Validating required columns...")
required.fields = c("Hugo_Symbol", "Chromosome", "Start_Position",
"End_Position", "Reference_Allele", "Tumor_Seq_Allele2",
"Variant_Classification", "Variant_Type", "Tumor_Sample_Barcode")
for (i in 1:length(required.fields)) {
colId = suppressWarnings(grep(pattern = paste("^", required.fields[i],
"$", sep = ""), x = colnames(Maf), ignore.case = TRUE))
if (length(colId) > 0) {
colnames(Maf)[colId] = required.fields[i]
}
}
missing.fileds = required.fields[!required.fields %in% colnames(Maf)]
if (length(missing.fileds) > 0) {
missing.fileds = paste(missing.fileds[1], sep = ",",
collapse = ", ")
stop(paste("missing required fields from Maf:", missing.fileds))
}
selected.fields = c("Hugo_Symbol", "Chromosome", "Start_Position",
"End_Position", "Reference_Allele", "Tumor_Seq_Allele2",
"Variant_Type", "Variant_Classification", "Variant_Label", "Tumor_Sample_Barcode")
to.fields = c("Hugo_Symbol", "Chromosome", "Start_Position",
"End_Position", "Reference_Allele", "Tumor_Seq_Allele",
"Variant_Type", "Variant_Classification", "Variant_Label", "Sample")
Maf = Maf %>%
dplyr::as_tibble() %>%
dplyr::mutate(Variant_Label = ifelse(.data$Variant_Classification %in% nonSyn,
"Nonsilent", "Silent"),
Variant_Type = ifelse(.data$Variant_Type == "SNP",
"SNV", .data$Variant_Type)) %>%
dplyr::select(selected.fields, dplyr::everything())
colnames(Maf)[1:length(to.fields)] = to.fields
if (verbose) message("==> Checking data keep level...")
keep_level = match.arg(keep_level)
if (keep_level == 'basic') {
Maf = Maf[, to.fields]
} else if (keep_level == 'custom') {
if (!is.null(keep_extra_cols)) {
if (all(keep_extra_cols %in% colnames(Maf))) {
Maf = Maf[, c(to.fields, keep_extra_cols)]
} else {
stop("Some columns provided by keep_extra_cols are not in data, please check it!")
}
}
}
if (verbose) message("=> Done")
}
if (!is.null(cnTable)) {
if (verbose) message("=> Reading copy number data...")
if (verbose) message("=> Done")
}
if (!is.null(Phenotype)) {
if (verbose) message("=> Reading phenotype data...")
if (verbose) message("=> Done")
}
return(VariationTable$new(data = Maf))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.