R/read_vcf.R

Defines functions read_xena_variants read_vcf

Documented in read_vcf read_xena_variants

#' Read VCF Files as MAF Object
#'
#' MAF file is more recommended. In this function, we will mimic
#' the MAF object from the key `c(1, 2, 4, 5, 7)` columns of VCF file.
#'
#' @param vcfs VCF file paths.
#' @param samples sample names for VCF files.
#' @param genome_build genome build version like "hg19".
#' @param keep_only_pass if `TRUE`, keep only 'PASS' mutation for analysis.
#' @param verbose if `TRUE`, print extra info.
#'
#' @return a [MAF].
#' @export
#' @seealso [read_maf], [read_copynumber]
#' @examples
#' vcfs <- list.files(system.file("extdata", package = "sigminer"), "*.vcf", full.names = TRUE)
#' \donttest{
#' maf <- read_vcf(vcfs)
#' maf <- read_vcf(vcfs, keep_only_pass = TRUE)
#' }
#' @testexamples
#' expect_is(maf, "MAF")
read_vcf <- function(vcfs, samples = NULL,
                     genome_build = c("hg19", "hg38", "T2T", "mm10", "mm9", "ce11"),
                     keep_only_pass = FALSE, verbose = TRUE) {
  genome_build <- match.arg(genome_build)
  vcfs_name <- vcfs
  if (verbose) message("Reading file(s): ", paste(vcfs, collapse = ", "))
  vcfs <- purrr::map(vcfs, ~ tryCatch(
    data.table::fread(., select = c(1, 2, 4, 5, 7), skip = "#CHROM"),
    error = function(e) {
      message("It seems ", ., " has no normal VCF header, try parsing without header.")
      data.table::fread(., select = c(1, 2, 4, 5, 7))
    }
  ))

  if (is.null(samples)) {
    names(vcfs) <- file_name(vcfs_name, must_chop = ".vcf")
  } else {
    if (length(samples) != length(vcfs_name)) {
      stop("Unequal files and samples!")
    }
    names(vcfs) <- samples
  }

  vcfs <- data.table::rbindlist(vcfs, use.names = FALSE, idcol = "sample")
  colnames(vcfs) <- c("Tumor_Sample_Barcode", "Chromosome", "Start_Position", "Reference_Allele", "Tumor_Seq_Allele2", "filter")
  if (!is.character(vcfs$Chromosome[1])) {
    vcfs$Chromosome <- as.character(vcfs$Chromosome)
  }

  if (keep_only_pass) {
    vcfs <- vcfs[vcfs$filter == "PASS", ]
    if (nrow(vcfs) < 1L) {
      stop("No mutation left after filtering.")
    }
  }
  vcfs$filter <- NULL

  vcfs$Chromosome <- ifelse(startsWith(vcfs$Chromosome, "chr"),
    vcfs$Chromosome,
    paste0("chr", vcfs$Chromosome)
  )

  vcfs$End_Position <- vcfs$Start_Position + nchar(vcfs$Reference_Allele) - 1L

  if (verbose) message("Annotating Variant Type...")
  vcfs$Variant_Type <- dplyr::case_when(
    nchar(vcfs$Reference_Allele) == 1L & nchar(vcfs$Tumor_Seq_Allele2) == 1L ~ "SNP",
    nchar(vcfs$Reference_Allele) < nchar(vcfs$Tumor_Seq_Allele2) ~ "INS",
    nchar(vcfs$Reference_Allele) > nchar(vcfs$Tumor_Seq_Allele2) ~ "DEL",
    nchar(vcfs$Reference_Allele) == 2L & nchar(vcfs$Tumor_Seq_Allele2) == 2L ~ "DNP",
    nchar(vcfs$Reference_Allele) == 3L & nchar(vcfs$Tumor_Seq_Allele2) == 3L ~ "TNP",
    TRUE ~ "Unknown"
  )

  vcfs$Variant_Classification <- "Unknown"
  vcfs$Hugo_Symbol <- "Unknown"
  vcfs$Gene_ID <- "Unknown"

  # Annotate gene symbol
  gene_dt <- get_genome_annotation("gene", genome_build = genome_build)

  if (verbose) message("Annotating mutations to first matched gene based on database of ", genome_build, "...")
  dt <- gene_dt[, c("chrom", "start", "end", "gene_name", "gene_id")]
  data.table::setkey(dt, "chrom", "start", "end")

  match_dt <- data.table::foverlaps(vcfs, dt,
    by.x = c("Chromosome", "Start_Position", "End_Position"),
    which = TRUE, nomatch = NULL
  )
  ## Keep only the first gene index
  match_dt <- match_dt[!duplicated(match_dt$xid)]
  vcfs$Hugo_Symbol[match_dt$xid] <- dt$gene_name[match_dt$yid]
  vcfs$Gene_ID[match_dt$xid] <- dt$gene_id[match_dt$yid]

  if (verbose) message("Transforming into a MAF object...")
  maftools::read.maf(
    vcfs,
    clinicalData = NULL,
    removeDuplicatedVariants = TRUE,
    useAll = TRUE,
    gisticAllLesionsFile = NULL,
    gisticAmpGenesFile = NULL,
    gisticDelGenesFile = NULL,
    gisticScoresFile = NULL,
    cnLevel = "all",
    cnTable = NULL,
    isTCGA = FALSE,
    vc_nonSyn = "Unknown",
    verbose = verbose
  )
}

#' Read UCSC Xena Variant Format Data as MAF Object
#'
#' @param path a path to variant file.
#'
#' @return a `MAF` object.
#' @export
#'
#' @examples
#' \donttest{
#' if (requireNamespace("UCSCXenaTools")) {
#'   library(UCSCXenaTools)
#'   options(use_hiplot = TRUE)
#'   example_file <- XenaGenerate(subset = XenaDatasets == "mc3/ACC_mc3.txt") %>%
#'     XenaQuery() %>%
#'     XenaDownload()
#'   x <- read_xena_variants(example_file$destfiles)
#'   x@data
#'   y <- sig_tally(x)
#'   y
#' }
#' }
#' @testexamples
#' if (requireNamespace("UCSCXenaTools")) {
#'   expect_is(y, "list")
#' }
read_xena_variants <- function(path) {
  dt <- data.table::fread(path)

  detect_name <- function(x, y, z) {
    if (x %in% z) x else y
  }

  data.table::setnames(
    dt,
    old = c(
      detect_name("Sample_ID", "sample", colnames(dt)),
      "gene",
      detect_name("chrom", "chr", colnames(dt)),
      "start", "end",
      detect_name("ref", "reference", colnames(dt)),
      "alt"
    ),
    new = c(
      "Tumor_Sample_Barcode", "Hugo_Symbol", "Chromosome",
      "Start_Position", "End_Position", "Reference_Allele", "Tumor_Seq_Allele2"
    )
  )

  dt$Variant_Type <- dplyr::case_when(
    nchar(dt$Reference_Allele) == 1L & nchar(dt$Tumor_Seq_Allele2) == 1L ~ "SNP",
    nchar(dt$Reference_Allele) < nchar(dt$Tumor_Seq_Allele2) ~ "INS",
    nchar(dt$Reference_Allele) > nchar(dt$Tumor_Seq_Allele2) ~ "DEL",
    nchar(dt$Reference_Allele) == 2L & nchar(dt$Tumor_Seq_Allele2) == 2L ~ "DNP",
    nchar(dt$Reference_Allele) == 3L & nchar(dt$Tumor_Seq_Allele2) == 3L ~ "TNP",
    TRUE ~ "Unknown"
  )
  dt$Variant_Classification <- "Unknown"
  dt$Hugo_Symbol <- "Unknown"

  maftools::read.maf(
    dt,
    clinicalData = NULL,
    removeDuplicatedVariants = TRUE,
    useAll = TRUE,
    gisticAllLesionsFile = NULL,
    gisticAmpGenesFile = NULL,
    gisticDelGenesFile = NULL,
    gisticScoresFile = NULL,
    cnLevel = "all",
    cnTable = NULL,
    isTCGA = FALSE,
    vc_nonSyn = "Unknown",
    verbose = FALSE
  )
}
ShixiangWang/sigminer documentation built on March 16, 2024, 12:30 p.m.