R/callsnvs.R

Defines functions callsnvs

Documented in callsnvs

#' Call somatic SNVs by using both global and local error models
#' @param sample.info.file The sample info file listing CASE and CONTROL samples. The format is simply 5 columns, tab-delimited, and there is no column header.
#' @param targetbed Genomic regions in the BED tab-delimited format.
#' @param pacbamfolder_bychrom The folder popluted by outputs by the \code{split_pacbam_bychrom} function.
#' @param pbem_dir The folder with outputs generated by the \code{compute_pbem} function. default: file.path(outdir, "BaseErrorModel")
#' @param controls_dir The folder with outputs generated by the \code{compute_afthreshold} function. default: file.path(outdir, "Controls")
#' @param detection.specificity The quantile of the GSE distribution(s) to use to compute the AF threhold. default: 0.995
#' @param replicas Replics sampling to define stability of AF threhold in bins of coverage. default: 1000
#' @param replicas.in.parallel default: 1
#' @param coeffvar.threshold Consider a bin as stable if the coefficient of variations after \code{replicas} is lower than the \code{coeffvar.threshold}. default: 0.01
#' @param AFbycov Apply coverage-based AF threhold. default: TRUE
#' @param mincov Minimum locus coverage in CASE sample. default: 10
#' @param mincovgerm Minimum locus coverage in CONTROL sample. default: 10
#' @param minalt Minimum number of reads supporting the alternative allele in CASE sample. default: 1
#' @param maxafgerm Maximum allelic fraction observed in matched CONTROL locus. default: 0.2
#' @param coverage_binning Bins of coverage into which divide AFs. default: 50
#' @param outdir The folder where outputs will be saved.
#' @param outdir.calls.name The subfolder name that will be created in the \code{outdir}. default: "Results"
#' @param chrom.in.parallel Number of chromosomes to run in parallel. default: 1
#' @return The \code{callsnvs} will create a subfolder for each CASE sample within the \code{outdir.calls.name} folder. There tab-delimeted files reporting SNV calls passing 3 consecutive filtering steps (\code{pmtab_F1}, \code{pmtab_F2} and \code{pmtab_F3}) are saved.
#' @return The function also return the data.frame \code{tabsnvs_index} reeporting for each sample the path to each of the tables generated during the 3 filtering steps.
#' @export
#' @examples
#' sample.info.file <- system.file("extdata", "test_sif_toy.tsv", package = "abemus")
#' outdir <- tempdir()
#' targetbed <- system.file("extdata", "regions_toy.bed", package = "abemus")
#' pacbamfolder_bychrom <- system.file("extdata", "pacbam_data_bychrom", package = "abemus")
#' pbem_dir <- system.file("extdata", "BaseErrorModel", package = "abemus")
#' controls_dir <- system.file("extdata", "Controls", package = "abemus")
#' callsnvs(sample.info.file,outdir,targetbed,pbem_dir,controls_dir,pacbamfolder_bychrom,replicas=1)
callsnvs <- function(sample.info.file,
                     outdir,
                     targetbed,
                     pbem_dir = file.path(outdir,"BaseErrorModel"),
                     controls_dir = file.path(outdir,"Controls"),
                     pacbamfolder_bychrom,
                     detection.specificity = 0.995,
                     replicas = 1000,
                     replicas.in.parallel = 1,
                     coeffvar.threshold = 0.01,
                     AFbycov = TRUE,
                     mincov = 10,
                     mincovgerm = 10,
                     minalt = 1,
                     maxafgerm = 0.2,
                     coverage_binning = 50,
                     outdir.calls.name = "Results",
                     chrom.in.parallel = 1){

  old_wd <- getwd()
  on.exit(setwd(old_wd))

  message(paste("[",Sys.time(),"]\tReading the sample.info.file"))
  sif <- import_sif(main_sif = sample.info.file)

  message(paste("[",Sys.time(),"]\tReading chromosomes from 'targetbed'"))
  chromosomes <- bed2positions(targetbed = targetbed,get_only_chromosomes = TRUE)[[1]]

  message(paste("[",Sys.time(),"]\tDetection of somatic SNVs in case samples"))
  if(!file.exists(file.path(outdir, outdir.calls.name))){
    dir.create(file.path(outdir, outdir.calls.name), showWarnings = TRUE)
  }

  # compute minaf_cov_corrected
  afthcor <- adjust_af_threshold_table(controls_dir = controls_dir,
                                       pbem_dir = pbem_dir,
                                       detection.specificity = detection.specificity,
                                       replicas = replicas,
                                       coverage_binning = coverage_binning,
                                       replicas.in.parallel = replicas.in.parallel,
                                       coeffvar.threshold = coeffvar.threshold)
  minaf_cov_corrected <- afthcor$minaf_cov_corrected

  # summary of thresholds applied
  fpam = data.frame(AFbycov = as.character(AFbycov),
                    spec = as.character(minaf_cov_corrected[1,1]),
                    mincov = as.character(mincov),
                    minalt = as.character(minalt),
                    mincovgerm = as.character(mincovgerm),
                    maxafgerm = as.character(maxafgerm),
                    filtering_date = Sys.time(),
                    stringsAsFactors = FALSE)
  write.table(fpam,file = file.path(outdir, outdir.calls.name, "filtering_criteria.txt"),col.names = TRUE,row.names = FALSE,sep="\t",quote = FALSE)

  # Import background pbem
  load(file.path(pbem_dir, "pbem_background.RData"))
  xbg <- as.numeric(bgpbem)

  TableSif <- sif$df_cases
  for(id in 1:nrow(TableSif)){
    this = TableSif[id,]
    name.patient = TableSif$patient[id]
    name.plasma = gsub(basename(this$plasma.bam),pattern = ".bam",replacement = "")
    name.germline = gsub(basename(this$germline.bam),pattern = ".bam",replacement = "")
    message(paste("[",Sys.time(),"]\tPatient:",name.patient,"\tCase:",name.plasma,"\tControl:",name.germline))
    out1 = paste0("pmtab_F1_",name.plasma,".tsv")
    out2 = paste0("pmtab_F2_",name.plasma,".tsv")
    out3 = paste0("pmtab_F3_",name.plasma,".tsv")
    # create sub-folder to save outputs per each case
    caseout_folder = file.path(outdir, outdir.calls.name, name.plasma)
    dir.create(caseout_folder,showWarnings = TRUE)

    if(is.na(name.germline)){

      germline.folder <- NA

      plasma.folder = list.files(pacbamfolder_bychrom, pattern = paste0(name.plasma,"$"),full.names = TRUE)
      if(length(plasma.folder)==0){
        message("[ ERROR ] Cannot find folder:\t",file.path(pacbamfolder_bychrom, name.plasma))
        stop()
      }

    } else {

      germline.folder = list.files(pacbamfolder_bychrom, pattern = paste0(name.germline,"$"),full.names = TRUE)
      if(length(germline.folder)==0){
        message("[ ERROR ] Cannot find folder:\t",file.path(pacbamfolder_bychrom, name.germline))
        stop()
      }
      plasma.folder = list.files(pacbamfolder_bychrom, pattern = paste0(name.plasma,"$"),full.names = TRUE)
      if(length(plasma.folder)==0){
        message("[ ERROR ] Cannot find folder:\t",file.path(pacbamfolder_bychrom, name.plasma))
        stop()
      }

    }

    # run in parallel on chromosomes
    mclapply(seq(1,length(chromosomes),1),
             filter,
             name.patient=name.patient,
             name.plasma=name.plasma,
             name.germline=name.germline,
             chromosomes=chromosomes,
             caseout_folder=caseout_folder,
             plasma.folder=plasma.folder,
             germline.folder=germline.folder,
             pbem_dir=pbem_dir,
             out1=out1,
             out2=out2,
             out3=out3,
             mincov=mincov,
             minalt=minalt,
             mincovgerm=mincovgerm,
             maxafgerm=maxafgerm,
             AFbycov=AFbycov,
             minaf_cov_corrected=minaf_cov_corrected,
             coverage_binning=coverage_binning,
             xbg=xbg,
             tab_cov_pbem=bombanel_tab_cov_pbem,
             afs=bombanel_afs,
             covs=bombanel_covs,
             mc.cores = chrom.in.parallel)

    # collapse all chromosome outs into a single table
    setwd(caseout_folder)
    tabs_list = list.files(caseout_folder,full.names = TRUE,recursive = TRUE,pattern = 'chrpm_f1.tsv')
    if(length(tabs_list)>0){
      mrgd <- lapply(tabs_list,fread,data.table=FALSE,stringsAsFactors = FALSE,header = FALSE)
      sapply(length(mrgd),function (x) write.table(mrgd[[x]],file=out1,append = TRUE,quote=FALSE,col.names=FALSE,row.names = FALSE,sep = "\t"))
    } else {
      cat(file = "f1_table_WARNING.txt","No calls found in chrpm_f1.tsv")
    }
    tabs_list = list.files(caseout_folder,full.names = TRUE,recursive = TRUE,pattern = 'chrpm_f2.tsv')
    if(length(tabs_list)>0){
      mrgd <- lapply(tabs_list,fread,data.table=FALSE,stringsAsFactors = FALSE,header = FALSE)
      sapply(length(mrgd),function (x) write.table(mrgd[[x]],file=out2,append = TRUE,quote=FALSE,col.names=FALSE,row.names = FALSE,sep = "\t"))
    } else {
      cat(file = "f2_table_WARNING.txt","No calls found in chrpm_f2.tsv")
    }
    tabs_list = list.files(caseout_folder,full.names = TRUE,recursive = TRUE,pattern = 'chrpm_f3.tsv')
    if(length(tabs_list)>0){
      mrgd <- lapply(tabs_list,fread,data.table=FALSE,stringsAsFactors = FALSE,header = FALSE)
      sapply(length(mrgd),function (x) write.table(mrgd[[x]],file=out3,append = TRUE,quote=FALSE,col.names=FALSE,row.names = FALSE,sep = "\t"))
    } else {
      cat(file = "f3_table_WARNING.txt","No calls found in chrpm_f3.tsv")
    }
  }

  # summary table index for calls
  tabsnvs_index <- get_tabcalls_path(TableSif = TableSif,
                                     outdir = outdir,
                                     outdir.calls.name = outdir.calls.name)

  message(paste("[",Sys.time(),"]\talright."))
  return(list(tabsnvs_index=tabsnvs_index))
}

Try the abemus package in your browser

Any scripts or data that you put into this service are public.

abemus documentation built on Dec. 19, 2019, 1:07 a.m.