R/class_VariationTable.R

Defines functions VT

Documented in VT

VariationTable <- R6::R6Class("VariationTable",
                          public = list(
                            data = NULL,
                            summary = NULL,
                            summary_by_gene = NULL,
                            phenotype = NULL,
                            reference=NULL,
                            initialize = function(data = NULL,
                                                  summary = NULL,
                                                  phenotype = NULL,
                                                  reference=NULL) {
                              self$data = data
                            },
                            print = function(...) {
                              cat("A VariationTable object: \n")
                            },
                            get_gene_summary = function() {

                            }
                          )
)


#' Read summarized variation data as VariationTable object
#'
#' Read tab delimited MAF, CNV table and other variation files (can be plain text or gz compressed),
#' or [data.frame]s. Input data will be summarized in various ways.
#'
#' @param Maf tab delimited MAF file. File can also be gz compressed.
#' Alternatively, you can also provide already read MAF file as a [data.frame].
#' @param cnTable tab delimited file or [data.frame] representing CNV. It has 5 necessary columns:
#' Chromosome (either chr# or # format, supports chr(1-22,X,Y)), Start, End,
#' Copynumber (absolute copy number value) and Sample.
#' @param Phenotype tab delimited file or [data.frame] representing phenotype data.
#' 'Tumor_Sample_Barcode' or 'Sample' can be used as sample ID.
#' The 'Tumor_Sample_Barcode' will be first priority if both column names exist.
#' @param refSNV genome build version, should be 'hg19' or 'hg38'. If `NULL`, it will be
#' obtained from `Maf`, 37 in `NCBI_Build` will set 'hg19' and 38 in `NCBI_Build` will set 'hg38'.
#' @param refINDEL same as above.
#' @param refCNV genome build version, should be 'hg19' or 'hg38'. Default is 'hg19'.
#' @param nonSyn Provide manual list of variant classifications to be considered as non-synonymous.
#' Rest will be considered as silent variants.
#' Default uses Variant Classifications with High/Moderate variant consequences.
#' <http://asia.ensembl.org/Help/Glossary?id=535>: "Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site",
#' "Translation_Start_Site","Nonsense_Mutation", "Nonstop_Mutation", "In_Frame_Del",
#' "In_Frame_Ins", "Missense_Mutation"
#' @param segnum_min minimal number of copy number segments within a sample.
#' @param segnum_max maximal number of copy number segments within a sample.
#' @param use_whole_genome if `TRUE`, all genome regions for copy number data will be used.
#' Regions with no copy number value will be set to normal copy 2.
#' Otherwise, only regions in input data are used.
#' @param keep_level Keep level for input data columns, you can use 'all' to
#' keep all columns and use 'custom' to add columns except for required columns.
#' @param keep_extra_cols extra columns to keep when `keep_level` set to 'custom'.
#' @param verbose print extra messages if `TRUE`.
#' @return a `VariationTable` object
#' @export
#'
#' @examples
#' library(maftools)
#' laml.Maf <- system.file("extdata", "tcga_laml.Maf.gz", package = "maftools")
#' laml <- VT(Maf = laml.Maf)
VT = function(Maf=NULL, cnTable=NULL, Phenotype=NULL,
              refSNV=NULL, refINDEL=refSNV, refCNV=c("hg19", "hg38"),
              nonSyn = c("Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site",
                         "Translation_Start_Site","Nonsense_Mutation", "Nonstop_Mutation",
                         "In_Frame_Del","In_Frame_Ins", "Missense_Mutation"),
              segnum_min=0, segnum_max=Inf,
              use_whole_genome=FALSE,
              keep_level = c('basic', 'all', 'custom'),
              keep_extra_cols = NULL,
              verbose=TRUE, ...) {
  stopifnot(any(!is.null(Maf), !is.null(cnTable)))

  refCNV = match.arg(refCNV)

  reference = dplyr::tibble(SNV=ifelse(is.null(refSNV), NA_character_, refSNV),
                            INDEL=ifelse(is.null(refINDEL), NA_character_, refINDEL),
                            CNV=refCNV)

  if (!is.null(Maf)) {
    if (verbose) {
      message("=> Reading mutation data...")
      message("==> Reading...")
    }

    if(!is.data.frame(x = Maf)){
      if(as.logical(length(grep(pattern = 'gz$', x = Maf, fixed = FALSE)))){
        #If system is Linux use fread, else use gz connection to read gz file.
        if(Sys.info()[['sysname']] == 'Windows'){
          Maf.gz = gzfile(description = Maf, open = 'r')
          suppressWarnings(Maf <- data.table::as.data.table(read.csv(file = Maf.gz, header = TRUE, sep = '\t', stringsAsFactors = FALSE, comment.char = "#")))
          close(Maf.gz)
        } else{
          Maf = suppressWarnings(data.table::fread(cmd = paste('zcat <', Maf), sep = '\t', stringsAsFactors = FALSE, verbose = FALSE, data.table = TRUE, showProgress = TRUE, header = TRUE, fill = TRUE, skip = "Hugo_Symbol"))
        }
      } else{
        suppressWarnings(Maf <- data.table::fread(input = Maf, sep = "\t", stringsAsFactors = FALSE, verbose = FALSE, data.table = TRUE, showProgress = TRUE, header = TRUE, fill = TRUE, skip = "Hugo_Symbol"))
      }
    }

    if (verbose) message("==> Checking reference genome build version...")

    if (is.null(refSNV)) {
      if (Maf$NCBI_Build[1] == 38 | Maf$NCBI_Build[1] == "38") {
        reference$SNV <- reference$INDEL <- 'hg38'
      } else if (Maf$NCBI_Build[1] == 37 & Maf$NCBI_Build[1] == '37') {
        reference$SNV <- reference$INDEL <- 'hg37'
      } else {
        stop("Only 37 or 38 is supported in Maf file, report to https://github.com/ShixiangWang/sigtools/issues for supporting new type.")
      }
    }

    if (verbose) message("==> Validating required columns...")

    required.fields = c("Hugo_Symbol", "Chromosome", "Start_Position",
                        "End_Position", "Reference_Allele", "Tumor_Seq_Allele2",
                        "Variant_Classification", "Variant_Type", "Tumor_Sample_Barcode")
    for (i in 1:length(required.fields)) {
      colId = suppressWarnings(grep(pattern = paste("^", required.fields[i],
                                                    "$", sep = ""), x = colnames(Maf), ignore.case = TRUE))
      if (length(colId) > 0) {
        colnames(Maf)[colId] = required.fields[i]
      }
    }
    missing.fileds = required.fields[!required.fields %in% colnames(Maf)]
    if (length(missing.fileds) > 0) {
      missing.fileds = paste(missing.fileds[1], sep = ",",
                             collapse = ", ")
      stop(paste("missing required fields from Maf:", missing.fileds))
    }


    selected.fields = c("Hugo_Symbol", "Chromosome", "Start_Position",
                        "End_Position", "Reference_Allele", "Tumor_Seq_Allele2",
                        "Variant_Type", "Variant_Classification", "Variant_Label", "Tumor_Sample_Barcode")
    to.fields = c("Hugo_Symbol", "Chromosome", "Start_Position",
                  "End_Position", "Reference_Allele", "Tumor_Seq_Allele",
                  "Variant_Type", "Variant_Classification", "Variant_Label", "Sample")

    Maf = Maf %>%
      dplyr::as_tibble() %>%
      dplyr::mutate(Variant_Label = ifelse(.data$Variant_Classification %in% nonSyn,
                                           "Nonsilent", "Silent"),
                    Variant_Type = ifelse(.data$Variant_Type == "SNP",
                                          "SNV", .data$Variant_Type)) %>%
      dplyr::select(selected.fields, dplyr::everything())

    colnames(Maf)[1:length(to.fields)] = to.fields

    if (verbose) message("==> Checking data keep level...")

    keep_level = match.arg(keep_level)

    if (keep_level == 'basic') {
      Maf = Maf[, to.fields]
    } else if (keep_level == 'custom') {
      if (!is.null(keep_extra_cols)) {
        if (all(keep_extra_cols %in% colnames(Maf))) {
          Maf = Maf[, c(to.fields, keep_extra_cols)]
        } else {
          stop("Some columns provided by keep_extra_cols are not in data, please check it!")
        }

      }
    }

    if (verbose) message("=> Done")
  }

  if (!is.null(cnTable)) {
    if (verbose) message("=> Reading copy number data...")

    if (verbose) message("=> Done")
  }

  if (!is.null(Phenotype)) {
    if (verbose) message("=> Reading phenotype data...")

    if (verbose) message("=> Done")
  }

  return(VariationTable$new(data = Maf))
}
ShixiangWang/sigtools documentation built on July 17, 2019, 9:54 p.m.