R/05_realignment.R

Defines functions gatk.realign gatk.targetcreator

Documented in gatk.realign gatk.targetcreator

#' @title gatk.targetcreator
#' @description A wrapper function to run GATK (RealignerTargetCreator)
#' @usage gatk.targetcreator(fns.bam, output.dir, sample.name, ref.fa, ref.indels, unsafe=FALSE, run.cmd=TRUE, mc.cores=1)
#' @param fns.bam Path to BAM files
#' @param output.dir Output directory
#' @param sample.name A character vector for the sample names
#' @param ref.fa Reference fasta file
#' @param ref.gold_indels Known sites to indel variant call format(VCF)
#' @param unsafe A parameter value for -U ALLOW_N_CIGAR_READS in GATK. This parameter must be TRUE in RNA-seq data. 
#' @param run.cmd Whether to execute the command line (default=TRUE)
#' @param mc.cores The number of cores to use. Must be at least one(default=1), and parallelization requires at least two cores.
#' @details Define insertion and deletion intervals to target for local realignment
#' @return Interval file included indel positions (e.g., .intervals)
#' @import parallel
#' @references The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data.
#' @seealso \url{https://software.broadinstitute.org/gatk/}
#' @export
gatk.targetcreator=function(fns.bam, 
                            output.dir, 
                            sample.name, 
                            
                            # ref
                            ref.fa, 
                            ref.gold_indels,
                            
                            unsafe=FALSE,
                            
                            gatk.thread.number=4,
                            run.cmd=TRUE,
                            mc.cores=1){
  
  out.dirs=file.path(output.dir, sample.name)
  lapply(1:length(out.dirs), function(a) dir.create(out.dirs[a], recursive = TRUE, showWarnings=FALSE))
  out.fns= file.path(out.dirs, paste0(sample.name, ".intervals"))
  
  message("[[",Sys.time(),"]] Run TargetCreator -------") 
  
  cmd.add=ifelse(unsafe, "-U ALLOW_N_CIGAR_READS", "")
  cmd=paste("java -jar", GATK.path, "-T RealignerTargetCreator",  "-I", fns.bam, "-R", ref.fa, "-o", out.fns, "-known", ref.gold_indels, "-nt", gatk.thread.number, cmd.add)
  print_message(cmd)
  
  if(run.cmd) mclapply(cmd, system,mc.cores=mc.cores)
  cat(cmd, file=file.path(output.dir, "targetcreator.run.log"), sep = "\n", append = FALSE)
  
  out.fns
}




#' @title gatk.realign
#' @description A wrapper function to run GATK (IndelRealigner)
#' @usage gatk.realign(fn.rmdu.bam, fn.intervals, output.dir, fn.realign.bam, ref.fa, ref.indels, unsafe=FALSE, run.cmd=TRUE, mc.cores=1)
#' @param fns.bam Path to input BAM files 
#' @param fns.interval Interval list file created by gatk.targetcreator
#' @param output.dir Output directory
#' @param sample.name A character vector for the sample names
#' @param ref.fa Reference fasta file
#' @param ref.gold_indels Known sites to indel variant call format(VCF)
#' @param unsafe A parameter value for -U ALLOW_N_CIGAR_READS in GATK. This parameter must be TRUE in RNA-seq data. 
#' @param run.cmd Whether to execute the command line (default=TRUE)
#' @param mc.cores The number of cores to use. Must be at least one(default=1), and parallelization requires at least two cores.
#' @details Perform local realignment of reads around indels. IndelRealigner takes a coordinate-sorted and indexed BAM and a target 
#'          intervals file generated by RealignerTargetCreator. IndelRealigner then performs local realignment on reads coincident with 
#'          the target intervals using consensus from indels present in the original alignment.
#' @return GATK IndelRealigner returns a Realigned bam file (e.g., realign.bam)
#' @import parallel
#' @references The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data.
#' @seealso \url{https://software.broadinstitute.org/gatk/}
#' @export
gatk.realign=function(fns.bam, 
                      fns.interval,
                      output.dir, 
                      sample.name,
                      
                      # ref
                      ref.fa, 
                      ref.gold_indels,
                      
                      unsafe=FALSE,
                      run.cmd=TRUE,
                      mc.cores=1){
  
  out.dirs=file.path(output.dir, sample.name)
  lapply(1:length(out.dirs), function(a) dir.create(out.dirs[a], recursive = TRUE, showWarnings=FALSE))
  out.fns= file.path(out.dirs, paste0(sample.name, ".realign.bam"))
  
  message("[[",Sys.time(),"]] Run reagliner -------")
  
  cmd.add=ifelse(unsafe, "-U ALLOW_N_CIGAR_READS", "")
  cmd=paste("java -jar", GATK.path, "-T IndelRealigner", "-targetIntervals", fns.interval, "-I", fns.bam, "-R", ref.fa, "-o", out.fns, "-known", ref.gold_indels, cmd.add)
  
  print_message(cmd)
  
  if(run.cmd) mclapply(cmd, system,mc.cores=mc.cores)
  cat(cmd, file=file.path(output.dir, "indelrealigner.run.log"), sep = "\n", append = FALSE)
  
  out.fns
}
omicsCore/SEQprocess documentation built on May 7, 2020, 4:18 a.m.