R/genSnpZygosity.R

Defines functions genSnpZygosity

Documented in genSnpZygosity

#' Simplifies the SNP Zygosity from MuTect
#' 
#' @description Takes a single sample Mutect VCF that has force called all dbSNP
#' sites and reduces it to a single column of 1,2,3 values corresponding to 
#' REF.HOMOZYGOUS, HETEROZYGOUS, and ALT.HOMOZYGOUS respectively using Perl. 
#' If multiple samples are given, it will concatenate the samples into a single matrix.
#' 
#' @param sampleid Vector of sample IDs for the filename [chr].[sampleid].vcf
#' @param chrs Chrs for the filename [chr].[sampleid].vcf
#' @param caller Either 'Mutect' or 'GATK' (collectAllelicCounts)
#' 
#' @return
#' Matrix of parsed output values
#' @export
genSnpZygosity <- function(sampleid, caller='GATK',
                           chrs = paste0("chr", c(1:22, "X", "Y"))){
  # sampleid <- 'NET-2-001b_02_T_DNA.processed'
  sampleid_dir <- file.path("tmp", gsub("[^a-zA-Z0-9]", "", sampleid))
  dir.create(sampleid_dir, recursive = TRUE, showWarnings = FALSE)
  
  self_pkg_name <- 'WadingPool'
  bin_dir <- system.file("bin/", package = self_pkg_name)
  
  categorize_results <- sapply(chrs, function(chr){
    if(grepl("^gatk$", caller, ignore.case = T)){
      tsv_file <- paste0(chr, ".", sampleid, ".allelicCounts.tsv")
      out_file <- file.path(sampleid_dir, paste0(chr, ".", sampleid, "_out.tsv"))
      
      ## Executes the categorizeAD.sh shell script that parses the single-sample
      #  MuTect VCF output 
      categorize_cmd <- paste0("sh ", bin_dir, "/categorizeAD_GATK.sh", 
                               " -t ", tsv_file, 
                               " -o ", out_file)
      categorize_res <- try(system(command = categorize_cmd, intern = TRUE))
    } else if(grepl("^mutect$", caller, ignore.case = T)){
      vcf_file <- paste0(chr, ".", sampleid, ".vcf")
      out_file <- file.path(sampleid_dir, paste0(chr, ".", sampleid, "_out.tsv"))
      
      ## Executes the categorizeAD.sh shell script that parses the single-sample
      #  MuTect VCF output 
      categorize_cmd <- paste0("sh ", bin_dir, "/categorizeAD.sh", 
                               " -v ", vcf_file, 
                               " -o ", out_file)
      categorize_res <- try(system(command = categorize_cmd, intern = TRUE))
    }
    
    return(c("Time"=as.numeric(parseOutput(categorize_res, "Runtime")),
             "In"=as.character(parseOutput(categorize_res, "vcf")),
             "Out"=as.character(parseOutput(categorize_res, "out"))))
  })
  categorize_results <- t(categorize_results)
  
}
quevedor2/WadingPool documentation built on Dec. 22, 2021, 10:59 a.m.