R/functions.R
In PlasmaMutationDetector: Tumor Mutation Detection in Plasma

Documented in BuildCtrlErrorRate DetectPlasmaMutation LoadBackgroundErrorRate MAF_from_BAM PrepareLibrary

# release 1.3   : first release fully running
# release 1.4   : in BuildCtrlErrorRate, hotspot file was not used correctly -> corrected
#                 in DetectPlasmaMutation, adjbox is now computed on the finite values of the 250 largest values
# release 1.4.1 : correct dependencies for R version 3.3.1 and CRAN submission
# release 1.4.2 : add data for examples
# release 1.4.3 : reduce data size and change PrepareLibrary function to not use Exac file
# release 1.4.4 : reduce data size more to fit CRAN policy (< 5MB) and add possibility to use extension .bam.bai for bai files in MAF_from_BAM
# release 1.4.5 : call to data(...) corrected with data(...,envir = environment())
# release 1.4.6 : LICENSE correction
# release 1.4.7 : remove "don't run" with shrinked examples
# release 1.4.8 : remove warnings coming from search of InDel with less than 3 bases + from geom_point and geom_text (see YVES 5/09/2016 in MAF_from_BAM)
# release 1.4.9 : add file.path() to example to run under windows
# release 1.5.0 : remove file.path() and check whether pathnames end properly by a '/' (see # TO RUN UNDER WINDOWS)
# release 1.5.1 : downsample example to fit the <5s time constraint of the CRAN
# release 1.5.2 : use Nicolas example to fit the <5s time constraint of the CRAN and add the possibility to use zip BER
# release 1.6   : adapt the code to accept no large deletion, remove one mistake and simplify code
# release 1.6.1 : bug introduced during simplifcation fixed
# release 1.6.2 : bug introduced during simplifcation fixed
# release 1.6.3 : in BuildCtrlErrorRate, use hotspot file as data.frame 27/11/2017. Now hotspot file should contains a variable name chrpos on the first line
# release 1.6.4 : save file in tempdir() by default to satisfy CRAN policy e.g. for debian distribution
# release 1.6.5 : debug 1.6.4
# release 1.7.0 : take care of mutations showing allele frequency larger than 50% // clean the code
# release 1.7.1 : bug correction in find.outliers

# #' @importFrom GenomicRanges GRanges
#' @importClassesFrom S4Vectors FilterRules
#' @importClassesFrom SummarizedExperiment RangedSummarizedExperiment
#' @importMethodsFrom SummarizedExperiment rowRanges
#' @import GenomicRanges
#' @import VariantAnnotation
#' @importFrom Rsamtools PileupParam BamFile scanBam pileup filterBam ScanBamParam scanBamWhat
#' @import ggplot2
#' @import grid
#' @importFrom graphics abline plot points text
#' @importFrom stats dnorm integrate median pbinom quantile binom.test
#' @importFrom utils data download.file read.delim read.table write.table unzip
#'
#' @encoding{utf-8}

# Bioconductor packages
# Install :
# source("https://bioconductor.org/biocLite.R")
# biocLite("GenomicRanges")
# biocLite("VariantAnnotation")
# biocLite('Rsamtools')
# biocLite('rtracklayer')
# biocLite("SummarizedExperiment")

# R packages
require(robustbase)
require(ggplot2)
require(grid)
require(GenomicRanges)
require(VariantAnnotation)
require(Rsamtools)
require(rtracklayer)

##################### UTILITY FUNCTIONS #####################

c2n = function(x) as.numeric(as.character(x))

sbscore = function(refplus, refmin, altplus, altmin) {

  if (is.na(altplus)|is.na(altmin)) return(NA)

  rmM = function(x,y) min(x,y)/max(x,y)
  z = refplus*altmin/(refmin*altplus)
  log( rmM(refplus,refmin)/rmM(altplus,altmin) * (z+1/z) )
}

pvalue = function(E,N=NULL,E0=NULL,N0=NULL,with.info=FALSE,use.int=TRUE) {
  # return the pvalue
  if (is.null(N)) {N=E[2]; E0=E[3]; N0=E[4]; E=E[1]}
  if (E0==0) E0=1/sqrt(N0) # provides an upper bound for the p-value by assuming control error rate equals 1/N0^(3/2)
  p0.hat = E0/N0
  sigma0.hat = sqrt(p0.hat*(1-p0.hat)/N0)
  if (with.info) {
    print(paste0('p0.hat=',p0.hat,' --- p.hat=',E/N))
    print(sigma0.hat)
  }
  psi = function(y) pbinom(E,N,pmax(pmin(p0.hat-sigma0.hat*y,1),0),lower.tail=F)*dnorm(y)
  if (use.int) p.val = integrate(psi,lower=-30,upper=30,subdivisions=10000L)$value else p.val = binom.test(E,N,p0.hat,'great')$p.value
  p.val
}


minus.logit = function(P) {
  sapply(P,function(x) {if ((!is.nan(x))&(x<1)) return(-log(x/(1-x))) else return(-Inf)})
}

find.outliers = function(P,chr.pos,thr.R=6.5,thr.P=0.000001,with.info=FALSE) {

  P[which(P==1)]=0.9999999999 ### just to have finite values
  names(P) = chr.pos
  ind.null = P==0
  P.null = P[which(ind.null)]
  P = P[which(!ind.null)]
  nP = length(P)
  L.tri = sort(minus.logit(P))
  dL = diff(L.tri);
  L.right = L.tri[-1];
  L.left = L.tri[-nP];
  sL.sqrt = sign(L.left)*(abs(L.left))^0.5 # to take the square root without changing the sign
  Ratio = dL/sL.sqrt

  outliers = c(which( (Ratio>thr.R) & (L.right>minus.logit(thr.P))),P.null)

  ### ADD YVES 29/03/2016
  # if (length(outliers)>0) outliers = which(L.right>=min(L.right[min(outliers)]))

  ind = names(outliers)

  if (with.info) { ### just for debugging purpose. Not used.
    plot(L.tri[-1][L.left>0],Ratio[L.left>0],col=1)
    abline(v=minus.logit(thr.P),col='red'); abline(h=thr.R,col='green'); abline(h=6,col='cyan'); abline(h=7,col='violet');
    # if (length(outliers)>0) outliers = which(L.right>=min(L.right[min(outliers)]))
    if (length(outliers)>0) points(L.right[outliers],Ratio[outliers],col='red',pch='x')
    if (length(outliers)>0) text(L.right[outliers],Ratio[outliers],labels=ind,cex=0.6,pos=2,col=1)
    if (length(outliers)>0) print(sort(P,decreasing=T)[outliers+1])
  }

  ind
}

UpperPart = function(values,q) {
  # Find the locations inside a set of values higher than an upper quantile of these values
  # Return the indices of the highest values, larger than the given quantile
  noisy = which(values > quantile(values,q))
  noisy
}

pileupFreq = function(pileupres) {
  nucleotides = levels(pileupres$nucleotide)
  res = split(pileupres, pileupres$seqnames)
  res = lapply(res, function (x) split(x, x$pos))
  res = lapply(res, function (positionsplit) {
    nuctab = lapply(positionsplit, function(each) {
      chr = as.character(unique(each$seqnames))
      pos = as.character(unique(each$pos))
      tablecounts = sapply(nucleotides, function (n)
        sum(each$count[each$nucleotide == n]))
      tablecountsplus = sapply(nucleotides, function (n)
        sum(each$count[each$nucleotide == n & each$strand=="+"]))
      tablecountsminus = sapply(nucleotides, function (n)
        sum(each$count[each$nucleotide == n & each$strand=="-"]))
      strandbias = sapply(nucleotides, function (n)
        max(0,each$count[each$strand=="+" & each$nucleotide == n])/
          (max(0,each$count[each$strand=="+" & each$nucleotide == n]) + max(0,each$count[each$strand=="-" & each$nucleotide == n]))
      )
      c(chr,pos, tablecounts, tablecountsplus, tablecountsminus, strandbias)
    })
    nuctab = data.frame(do.call("rbind", nuctab),stringsAsFactors=F)
    rownames(nuctab) = NULL
    nuctab
  })
  res = data.frame(do.call("rbind", res),stringsAsFactors=F)
  rownames(res) = NULL
  colnames(res) = c("seqnames","start",levels(pileupres$nucleotide),
                    paste0(levels(pileupres$nucleotide),"_plus"),
                    paste0(levels(pileupres$nucleotide),"_minus"),
                    paste0(levels(pileupres$nucleotide),"_SB"))
  res[3:ncol(res)] = apply(res[3:ncol(res)], 2, as.numeric)
  res
}

############ END OF UTILITY FUNCTIONS ##################


############ DOCUMENTED FUNCTIONS ######################

#' function PrepareLibrary
#'
#' Define the Genomic Ranges and Genomic Positions covered by the AmpliSeq™ Panel to include in the study and define SNP positions to exclude from the study.
#' Trimming amplicon ends is performed if specified. This function is mostly useful if you want to add some SNP positions which are not existing in the
#' positions_ranges.rda file provided within the package. It is provided to be able to reconstruct \code{positions_ranges.rda} data.
#'
#' @param info.dir, char, name of the folder containing the library information files (default 'Info/')
#' @param bed.filename, char, name of a BED table (tab-delimited) describing the Panel (with first 3 columns: "chr" (ex:chr1), "start position" (ex:115252190), "end position" (ex:115252305), i.e. the Ion AmpliSeq™ Colon and Lung Cancer Research Panel v2 (default 'lungcolonV2.bed.txt' as provided in the inst/extdata/Info folder of the package).
#' @param snp.filename, char, name of the vcf file describing known SNP positions, obtained from ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3/ExAC.r0.3.sites.vep.vcf.gz (default 'ExAC.r0.3.sites.vep.vcf.gz'). It requires a corresponding TBI file to be in the same folder (obtained from ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3/ExAC.r0.3.sites.vep.vcf.gz.tbi)
#' @param snp.extra, a vector of char, a vector of extra known snp positions manually curated (ex:"chrN:XXXXXXXXX")
#' @param output.name, char, filename to save \code{pos_ind} and \code{pos_snp} (default 'positions_ranges.rda')
#' @param output.dir, char, directory where to save \code{pos_ind} and \code{pos_snp} (default \code{info.dir})
#' @param load.from.broad.insitute, boolean, if TRUE load \code{snp.filename} from Broad Institute ftp server otherwise use the file positions_ranges_broad.rda (default FALSE)
#' @author N. Pécuchet, P. Laurent-Puig and Y. Rozenholc
#' @seealso positions_ranges,
#' @references \emph{Analysis of base-position error rate of next-generation sequencing to detect tumor mutations in circulating DNA} N. Pécuchet, Y. Rozenholc, E. Zonta, D. Pietraz, A. Didelot, P. Combe, L. Gibault, J-B. Bachet, V. Taly, E. Fabre, H. Blons, P. Laurent-Puig in \emph{Clinical Chemistry}
#'
#' @return Save the following variables in a .rda file defined by \code{output.name} in the folder defined by \code{output.dir}:
#' \itemize{
#' \item \code{pos_ranges}, a GRanges descriptor of amplicon positions
#' \item \code{pos_ind}, a vector of char "chrN:XXXXXXXXX", defining ALL index positions
#' \item \code{pos_snp}, a vector of char "chrN:XXXXXXXXX", defining SNP positions
#' }
#'
#' @export PrepareLibrary
#'
#' @examples
#'    bad.pos = "chr7:15478"
#'    PrepareLibrary(info.dir='./',snp.extra=bad.pos,output.dir=paste0(tempdir(),'/'))
#'
#'
PrepareLibrary = function(info.dir='Info/',
                          bed.filename='lungcolonV2.bed.txt', # amplicon positions
                          snp.filename='ExAC.r0.3.sites.vep.vcf.gz', # known snp
                          snp.extra=c("chr2:212812097", "chr4:1807909", "chr7:140481511", "chr14:105246474", "chr18:48586344", "chr19:1223055"), # extra snp
                          output.name='positions_ranges.rda',
                          output.dir=info.dir,
                          load.from.broad.insitute=FALSE
) {

  IRanges = NULL

  # Find SNP
  snp.file = paste0(info.dir,snp.filename)
  tbi.file = paste0(snp.file,'.tbi')
  if (!file.exists(snp.file) | !file.exists(tbi.file)) {
    print(paste0('File ',snp.file,' or ',tbi.file,' do not exist in ',info.dir,' ...'))
    if (load.from.broad.insitute) {
      print('Download ExAC release 0.3 from Broad Institute ... please wait !!!')
      if (!file.exists(snp.file)) download.file(paste0('ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3/',snp.filename), snp.file, method='libcurl')
      if (!file.exists(tbi.file)) download.file(paste0('ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3/',snp.filename,'.tbi'), paste0(snp.file,'.tbi'), method='libcurl')
    } else {
      # DEFAULT : use provided file positions_ranges.rda
      print('Use positions_ranges.rda of the package')
      data(positions_ranges,envir = environment())
    }
  }

  if (file.exists(snp.file) & file.exists(tbi.file)) {
    # Read bed file
    bed.file = paste0(info.dir,bed.filename)
    if (!file.exists(bed.file)) stop(paste0(bed.file,' not found in ',info.dir))

    bed = read.delim(bed.file, header=FALSE) # original amplicons
    # Build index of all positions
    pos_ind = NULL
    for(i in 1:nrow(bed)) pos_ind = c(pos_ind,paste0(bed[i,1],':',bed[i,2]:bed[i,3]))

    # original positions for each amplicons
    pos_ranges = GRanges(bed[,1], IRanges(bed[,2], bed[,3]))

    # Build our snp selection
    vcf = readVcf(snp.file, "hg19", param=ScanVcfParam(which=GRanges(gsub('chr','',bed[,1]), IRanges(bed[,2], bed[,3]))))
    snp = cbind(info(vcf),as.data.frame(rowRanges(vcf))) # extract vcf information
    snp$AF = sapply(snp$AF, max) # consider only max allele frequency
    snp = snp[which(snp$AF>1e-4),] # only "real" snp with Allele Frequency >1e-4
    pos_snp = paste0('chr',snp$seqnames,':',snp$start) # build snp positions
  }

  pos_snp = c(pos_snp,snp.extra) # add the SNP positions you want


  output.name = paste0(output.dir,output.name)
  save(pos_ranges,pos_ind,pos_snp,file=output.name)
  print('File saved in ...')
  return(output.name)
}


#' function MAF_from_BAM
#'
#' Read BAM files  and create MAF file. BAMfiles are stored in a sub-folder '/rBAM'.
#' MAF files are intermediate files stored in a sub-folder '/BER'.
#' MAF files contain the raw counts of A,T,C,G, insertion, deletion, insertion>2bp, deletion >2bp for strand plus and stand minus.
#' Note : we strongly recommand to externally recalibrate BAM files using tools like GATK.
#'
#' @param study.dir, char, name of the folder containing the rBAM directory  (default 'Plasma/'). The typical folder hierarchy will consist of 'Plasma/rBAM'
#' @param input.filenames, a vector of char (default NULL), the names of the BAM files to process. If NULL all BAM files in the rBAM folder will be processed
#' @param bai.ext, char, filename extension of the bai files (default '.bai')
#' @param pos_ranges.file, char, name of the Rdata file containing the three variables \code{pos_ind}, \code{pos_snp} and \code{pos_ranges} as build by the function \code{PrepareLibrary}. Default NULL, use the position_ranges.rda provided, used for our analysis.
#' @param force, boolean, (default FALSE) if TRUE force all computations to all files including already processed ones
#' @param output.dir, char, name of the folder to save results  (default \code{study.dir}).
#' @author N. Pécuchet, P. Laurent-Puig and Y. Rozenholc
#' @references \emph{Analysis of base-position error rate of next-generation sequencing to detect tumor mutations in circulating DNA} N. Pécuchet, Y. Rozenholc, E. Zonta, D. Pietraz, A. Didelot, P. Combe, L. Gibault, J-B. Bachet, V. Taly, E. Fabre, H. Blons, P. Laurent-Puig in \emph{Clinical Chemistry}
#'
#' @return the path/names of the MAF files
#'
#' @export MAF_from_BAM
#'
#' @examples
#'   \dontrun{
#'      ctrl.dir = system.file("extdata", "4test_only/ctrl/", package = "PlasmaMutationDetector")
#'      if (substr(ctrl.dir,nchar(ctrl.dir),nchar(ctrl.dir))!='/')
#'        ctrl.dir = paste0(ctrl.dir,'/') # TO RUN UNDER WINDOWS
#'      MAF_from_BAM(ctrl.dir,force=TRUE,output.dir=paste0(tempdir(),'/'))
#'    }
#'
MAF_from_BAM = function(study.dir='Plasma/',input.filenames=NULL,bai.ext='.bai',
                        pos_ranges.file=NULL,force=FALSE,output.dir=study.dir) {

  pos_ind = pos_snp = pos_ranges = NULL

  if (is.null(pos_ranges.file)) {
    data(positions_ranges,envir=environment())
  } else {
    if (!file.exists(pos_ranges.file)) stop(paste('STOP :::',pos_ranges.file,' does not exist'))
    load(pos_ranges.file)
  }

  ######## Input/Output folders
  bam.dir = paste0(study.dir,'rBAM/')        # Folder for BAM files and BAI index files
  ber.dir = paste0(study.dir,'BER/')         # Default folder for BER files
  ber.outdir = paste0(output.dir,'BER/')     # Folder to save Background Error Rate files extracted from BAM files
  if (!dir.exists(ber.outdir)) dir.create(ber.outdir)

  ######## pileup parameters
  pileup.param = PileupParam(max_depth=100000, min_base_quality=20, min_mapq=5, min_nucleotide_depth=1, min_minor_allele_depth=0, # min_base_quality=20, min_mapq=5
                             distinguish_strands=TRUE, distinguish_nucleotides=TRUE, ignore_query_Ns=TRUE, include_deletions=TRUE,
                             include_insertions=TRUE)

  ######## Samples to process
  if (is.null(input.filenames)) {
    print(paste0("Looking for bam and bai in ",bam.dir,' ...'))
    input.filenames = dir(bam.dir, pattern="*.bam$") # list all BAM files in bam.dir
  }

  bam.files = paste0(bam.dir,input.filenames) # build bam.files as paste0(study.dir,'rBAM/',input.filenames[i])

  if (length(bam.files)==0) stop(paste('STOP ::: no bam file found ... NOTE: bam files need to be in a subdirectory called rBAM'))

  ber.files = rep(NA,length(bam.files))

  for( g in 1:length(bam.files)){

    bam.name = bam.files[g]
    bai.name = gsub('.bam$',bai.ext,bam.name)
    ber.name = gsub(bam.dir,ber.dir,gsub('.bam$','_MAF.txt',bam.name),fixed=TRUE) # local BER
    berzip.name = paste0(ber.name,'.zip') # local zip BER
    ber.outname = gsub(bam.dir,ber.outdir,gsub('.bam$','_MAF.txt',bam.name),fixed=TRUE) # BER to create
    berzip.outname = paste0(ber.outname,'.zip') # BER to create in zip version

    ber.files[g] = ber.outname

    if (!force & file.exists(ber.name)) {
      ber.files[g] = ber.name
      next()
    }
    if (!force & file.exists(ber.outname)) {
      ber.files[g] = ber.outname
      next()
    }

    if (!force & file.exists(berzip.name)) {
      ber.files[g] = unzip(zipfile=berzip.name,exdir=ber.outdir)
      # write.table(readLines(unz(berzip.name,rev(strsplit(ber.name,'/')[[1]])[1])),file=ber.outname,sep=' ',row.names=FALSE,col.names=FALSE,quote=FALSE)
      next()
    }
    if (!force & file.exists(berzip.outname)) {
      ber.files[g] = ber.files[g] = unzip(zipfile=berzip.outname,exdir=ber.outdir)
      # write.table(readLines(unz(berzip.outname,rev(strsplit(ber.name,'/')[[1]])[1])),file=ber.outname,sep=' ',row.names=FALSE,col.names=FALSE,quote=FALSE)
      next()
    }

    print(paste0('Processing MAF from file ',bam.name))

    ### Load BAM and BAI files
    if (!file.exists(bam.name)) stop(paste('STOP :::',bam.name,'not found ... NOTE : bam files need to be in a subdirectory called rBAM'))
    if (!file.exists(bai.name)) {
      bai.sndname = paste0(bam.name,'.bai')
      if (!file.exists(bai.sndname)) stop(paste('STOP :::',bai.name,' and ',bai.sndname,'not found ... NOTE : bai files need to be in a subdirectory called rBAM'))
      bai.name = bai.sndname
    }
    bf = BamFile(bam.name, index=bai.name)

    ### Import, count, index, filter, sort, and merge BAM
    ft = scanBam(bf, scanBamParam=ScanBamParam(which=pos_ranges, what=scanBamWhat()), pileupParam=pileup.param)

    ### Count SNV from the BAM file
    res = pileup(bf, scanBamParam=ScanBamParam(which=pos_ranges, what=scanBamWhat()), pileupParam=pileup.param)

    ### Call function pileupFreq to get a data.frame of counts
    freq = pileupFreq(res)

    freq$chrpos = paste(freq[,1],freq[,2], sep=":") # positions existing in the BAM file
    select = freq[which(freq$chrpos %in% pos_ind),] # extract positions of interest


    ### Compute coverture; major coverture, major frequency, background frequency, second major coverture
    select[,'cov']=select[,3]+select[,4]+select[,5]+select[,6]
    select[,'maj']=apply(select[,3:6],1,max)
    select[,'fmaj']=select$maj/select$cov
    select[,'bruitdefond']=1-select$fmaj
    select[,'MA'] = apply(select[,3:6],1,sort)[3,]
    # select[,'M3'] = apply(select[,3:6],1,sort)[2,]
    # select[,'M4'] = apply(select[,3:6],1,sort)[1,]

    ### Select and count reads with DELETIONS >2 nucleotides
    dels = grep("[D]", ft[[1]][["cigar"]], invert=FALSE)  # index reads carrying deletions
    numdel = regmatches(as.character(ft[[1]]$cigar[dels]), gregexpr('([0-9]+)D',ft[[1]]$cigar[dels])) # get number of deleted bp
    numdel.max = sapply(lapply(numdel,function(x) as.numeric(gsub('D','',x)) ),max) # for reads with multiple deletions, get the maximum number of deleted bp
    selection = dels[which(as.numeric(numdel.max)>2)] # index reads carrying deletions > 2 bp
    subft = sapply(ft[[1]],function(x) x[selection] ) # generate filter for temporary BAMfile export that contain reads with deletion >2bp only

    ### Select read ids with deletion >2bp
    filter = S4Vectors::FilterRules(list(KeepQname = function(ft) ft$qname %in% subft$qname))
    ### Build a temporary BAM file with selected reads
    dest = filterBam(bam.name, tempfile(),index=bai.name, filter=filter, param=ScanBamParam(what="qname"))
    ### Import this temporary BAM file
    gal = rtracklayer::import(dest,format='bam', param=ScanBamParam(what=(c("qname", "seq", "qual"))) ) # import the temporary BAMfile that contain reads with deletion >2bp only

    ### Select read ids with long deletions, skip the deletions < 3bp that could co-occur in the same read by replacing CIGAR D by CIGAR N
    for (z in 1:length(gal@cigar)) {
      mm = regmatches(gal@cigar[z], gregexpr('[0-9]+[A-Z]',gal@cigar[z]))
      ind = grep('D',mm[[1]])       # YVES 5/09/2016
      if (length(ind)>0) {          # YVES 5/09/2016
        jnd = which(as.numeric(gsub('D','',mm[[1]][ind]))<3)                    # YVES 5/09/2016
        if (length(jnd)>0) mm[[1]][ind[jnd]] = gsub('D','N',mm[[1]][ind[jnd]])  # YVES 5/09/2016
      }                             # YVES 5/09/2016
      # mm[[1]][which(as.numeric(as.character(gsub("D", "", mm[[1]]))) < 3)]=gsub("D", "N", mm[[1]][which(as.numeric(as.character(gsub("D", "", mm[[1]]))) < 3)])
      gal@cigar[z] = paste(mm[[1]], collapse="")
    }
    ### Build a temporary BAM file with selected reads
    dest = rtracklayer::export(gal, BamFile(tempfile())) # export the temporary BAMfile cleared of any deletions < 3bp
    ### Import this temporary BAM file
    res = pileup(dest, scanBamParam=ScanBamParam(which=pos_ranges, what=scanBamWhat()), pileupParam=pileup.param)

    ### Build a data.frame of long deletion counts
    if (nrow(res)>0) {
      freq = pileupFreq(res)	#read the cleared temporary BAMfile
      contig = freq[which(freq[,'-'] > 0),c(1:9,17,25,33)] #create dataframe for long deletions
      contig$contig = rep(NA,nrow(contig)) #add a new variable to define starting point of deletion
      # print(names(contig))

      # YVES 08/11/2017
      # A simpler way to find large deletions (>2bp) and assign : YVES 8/11/2017
      # frequency consistence changes in this version
      loc = dloc = 1
      while(loc<nrow(contig)) {
        while ((loc+dloc<=nrow(contig)) & (as.numeric(contig$start[loc])+dloc==as.numeric(contig$start[loc+dloc])) &
               (abs(mean(contig[loc+(0:(dloc-1)),'-'])-contig[loc+dloc,'-'])/mean(contig[loc+(0:(dloc-1)),'-']) < 0.2)) dloc = dloc+1
        dloc = dloc-1 # now dloc is the number of bases with same deletion as loc
        if (dloc>1) { # found large deletion (>2bp)
          # print(loc+(0:dloc))
          # cat(contig$seqnames[loc+(0:dloc)],'\t'); cat(contig$seqnames[loc+dloc+(1:3)],'\n')
          # cat(contig$start[loc+(0:dloc)],'\t'); cat(contig$start[loc+dloc+(1:3)],'\n')
          # cat((contig[loc+(1:dloc),'-']-contig[loc,'-'])/contig[loc,'-'],'\t'); cat((contig[loc+dloc+(1:3),'-']-contig[loc,'-'])/contig[loc,'-'],'\n')
          contig$contig[loc+(0:dloc)] = loc
          loc = loc+dloc+1
        } else {
          loc = loc+1
        }

        dloc = 1
      }

      ### remove positions with inconsistent deletions (<2bp or different frequency)
      contig2 = contig[!is.na(contig$contig),]

      contig2$chrpos = rep(NA,nrow(contig2)) # to avoid pb when empty

      if (nrow(contig2)>0) contig2$chrpos=paste0(contig2$seqnames,":",contig2$start)

      # YVES 08/11/2017
      ### A warning to prevent the false duplication of deletions
      if (any(duplicated(contig2$chrpos)))
          cat(' !!!!!!! warning  !!!!! duplicates in deletion for sample',bam.files[g],'\n')

    } else {
      contig2 = as.data.frame(matrix(nrow=0,ncol=14))
      colnames(contig2)=c("seqnames","start","A","C","G","T","N","=","-","-_plus","-_minus","-_SB","contig","chrpos")
    }

    ### Select and count reads with INSERTIONS >2 nucleotides
    ins = grep("[I]", ft[[1]][["cigar"]], invert=FALSE) # index reads carrying insertions
    numins = regmatches(as.character(ft[[1]]$cigar[ins]), gregexpr('([0-9]+)I',ft[[1]]$cigar[ins])) # get number of inserted bp
    numins2 = sapply(sapply(numins,function(x){gsub('I','',x)}),max) # for reads with multiple insertions, get the maximum number of inserted bp
    selection=ins[which(as.numeric(numins2)>2)] # index reads carrying deletions > 2bp
    subft = sapply(ft[[1]],function(x) x[selection])

    ### Select read ids with deletion >2bp only
    filter = S4Vectors::FilterRules(list(KeepQname = function(ft) ft$qname %in% subft$qname))
    ### Build a temporary BAM file with selected reads
    dest = filterBam(bam.name, tempfile(),index=bai.name, filter=filter, param=ScanBamParam(what="qname"))
    ### Import this temporary BAM file
    gal = rtracklayer::import(dest,format='bam', param=ScanBamParam(what=(c("qname", "seq", "qual"))) ) # import the temporary BAMfile that contain reads with deletion >2bp only

    # For reads with long insertions, skip the insertions < 3bp that could co-occur in the same read by replacing CIGAR I by CIGAR S
    for (z in 1:length(gal@cigar)){
      mm = regmatches(gal@cigar[z], gregexpr('[0-9]+[A-Z]',gal@cigar[z]))
      ind = grep('I',mm[[1]])       # YVES 5/09/2016
      if (length(ind)>0) {          # YVES 5/09/2016
        jnd = which(as.numeric(gsub('I','',mm[[1]][ind]))<3)                    # YVES 5/09/2016
        if (length(jnd)>0) mm[[1]][ind[jnd]] = gsub('I','S',mm[[1]][ind[jnd]])  # YVES 5/09/2016
      }                             # YVES 5/09/2016
      # mm[[1]][which(as.numeric(as.character(gsub("I", "", mm[[1]]))) < 3)]=gsub("I", "S", mm[[1]][which(as.numeric(as.character(gsub("I", "", mm[[1]]))) < 3)])
      gal@cigar[z] = paste(mm[[1]], collapse="")
    }
    ### Build a temporary BAM file with selected reads
    dest = rtracklayer::export(gal, BamFile(tempfile())) # export the temporary BAMfile cleared of any insertsions < 3bp
    ### Import this temporary BAM file
    res = pileup(dest, scanBamParam=ScanBamParam(which=pos_ranges, what=scanBamWhat()), pileupParam=pileup.param)

    if(nrow(res)>0) {
      freq = pileupFreq(res)		# read the cleared temporary BAMfile
      incontig = freq[which(freq[,'+'] > 0),c(1:8,10,18,26,34)] # create dataframe for long insertions
    } else {
      incontig = as.data.frame(matrix(nrow=0,ncol=13))
      colnames(incontig)=c("seqnames","start","A","C","G","T","N","=","+","+_plus","+_minus","+_SB","chrpos")
    }
    unlink(dest)

    ##### Merge dataframes for SNV, DELETIONS>2bp and INSERTIONS>2bp
    if (nrow(incontig)>0) incontig[,'chrpos']=paste(incontig[,'seqnames'],":",incontig[,'start'],sep='')

    mergeindels = merge(contig2[,c(9:14)], incontig[,9:13], by='chrpos', all=TRUE)
    colnames(mergeindels) = c('chrpos','longdel','deletion_plus','deletion_minus','longdel_SB','contig','longins','insertion_plus','insertion_minus','longins_SB')
    select = merge(select, mergeindels, by='chrpos', all=TRUE)
    select[is.na(select$longdel),'longdel'] = 0
    select[is.na(select$longins),'longins'] = 0
    select[,'longINDEL'] = apply(select[,c('longdel','longins')],1,max)

    ### save ber.file
    write.table(select[!duplicated(select$chrpos),], file=ber.outname, row.names=FALSE, col.names=TRUE) ####  save temporary files
  }

  return(ber.files)

}


#' function BuildCtrlErrorRate
#'
#' Compute the SNV Position-Error Rates and INDEL Position-Error Rates from control samples (available in the control directory \code{ctrl.dir}).
#' This function requires MAF files, that will be automatically generated if not present in the specified control folder.
#' SNV PER is computed as the sum in control samples of SNV background counts / sum in control samples of depths where SNV background counts = depth - major allele count.
#' INDEL PER is computed as sum in control samples of INDEL background counts / sum in control samples of depths where INDEL background counts = sum of insertion and deletion counts.
#'
#' @param ctrl.dir, char, foldername containing the control files (default 'Plasma ctrl/'). The typical folder hierarchy will consist of 'Plasma ctrl/rBAM'
#' @param bai.ext, char, filename extension of the bai files (default '.bai')
#' @param pos_ranges.file, char, name of the Rdata file containing the three variables \code{pos_ind}, \code{pos_snp} and \code{pos_ranges} as build by the function \code{PrepareLibrary}. Default NULL, use the position_ranges.rda provided, used for our analysis.
#' @param hotspot.file, char, name of the text file containing a list of the genomic positions of the hotspots (default NULL, read the provide hotspot.txt, see \code{hotspot})
#' @param force, boolean, (default FALSE) if TRUE force all computations to all files including already processed ones
#' @param output.dir, char, name of the folder to save results (default \code{ctrl.dir}).
#' @author N. Pécuchet, P. Laurent-Puig and Y. Rozenholc
#' @references \emph{Analysis of base-position error rate of next-generation sequencing to detect tumor mutations in circulating DNA} N. Pécuchet, Y. Rozenholc, E. Zonta, D. Pietraz, A. Didelot, P. Combe, L. Gibault, J-B. Bachet, V. Taly, E. Fabre, H. Blons, P. Laurent-Puig in \emph{Clinical Chemistry}
#'
#' @return the number of processed files
#'
#' @export BuildCtrlErrorRate
#'
#' @examples
#' \dontrun{
#'    ctrl.dir = system.file("extdata", "4test_only/ctrl/", package = "PlasmaMutationDetector")
#'    if (substr(ctrl.dir,nchar(ctrl.dir),nchar(ctrl.dir))!='/')
#'      ctrl.dir = paste0(ctrl.dir,'/') # TO RUN UNDER WINDOWS
#'    BuildCtrlErrorRate(ctrl.dir,output.dir=paste0(tempdir(),'/'))
#'    }
#'
BuildCtrlErrorRate = function(ctrl.dir='Plasma ctrl/',bai.ext='.bai',pos_ranges.file=NULL,hotspot.file=NULL,
                              force=FALSE,output.dir=ctrl.dir) {


  ### read position informations
  pos_snp = pos_ind = NULL
  if (is.null(pos_ranges.file)) {
    data(positions_ranges,envir=environment())
  } else {
    if (!file.exists(pos_ranges.file)) stop(paste('STOP :::',pos_ranges.file,'does not exist'))
    load(pos_ranges.file)
  }

  ### read hotspot positions
  hotspot = NULL
  if (is.null(hotspot.file)) {
    data(hotspot,envir=environment())
  } else {
    if (!file.exists(hotspot.file)) stop(paste('STOP :::',hotspot.file,'does not exist'))
    hotspot = read.delim(hotspot.file, header=TRUE) # YVES 27/11/2017 FALSE -> TRUE
    if (!any(colnames(hotspot)=='chrpos')) stop(paste('STOP :::',hotspot.file,'has to contain a variable named chrpos (defined on first line)'))
  }

  ### extract counts from all BAM files found in ctrl.dir
  ctrl.files =  MAF_from_BAM(study.dir=ctrl.dir,input.filenames=NULL,bai.ext=bai.ext,pos_ranges.file=pos_ranges.file,force=force,output.dir=output.dir)

  print(ctrl.files)

  # ### list Background-Error-Rate files
  # ctrl.files = dir(paste0(ctrl.dir,'BER'), pattern="*.txt$", full.names=TRUE) # list all Background Error Rate files # before 1.6.5

  ### Assemble background errors from control samples
  All = X = read.table(ctrl.files[1], header=TRUE, sep=" ")

  ### utility function
  ATGC.ind = which(colnames(X) %in% c('A','T','G','C'))
  ATGC = colnames(X)[ATGC.ind]
  fct.MAJ = function(X) {
    data.frame(chrpos=as.character(X$chrpos),REF=apply(X,1,function(x) ATGC[which(x[ATGC.ind]==x['maj'])[1]]))
  }

  # extract main variant = genome ref
  MAJ = fct.MAJ(X)

  if (length(ctrl.files)>1) ind.g = 2:length(ctrl.files) else ind.g = 1 # just for testing purpose with one bam
  for (g in ind.g) {
    Y = read.table(ctrl.files[g],header=T,sep=" ")
    X = merge(Y,X,by="chrpos",all=TRUE,suffixes=c(as.character(g),''))
    MAJ = merge(fct.MAJ(Y),MAJ,by='chrpos',all=TRUE,suffixes=c(as.character(g),''))
    All.plus = merge(All,subset(Y,select='chrpos'),by='chrpos',all=TRUE,suffixes=c('',as.character(g)))[,colnames(All)]
    Y.plus = merge(subset(All,select='chrpos'),Y,by='chrpos',all=TRUE,suffixes=c('',as.character(g)))[,colnames(All)]
    for (j in colnames(All.plus))
      if (is.numeric(All.plus[,j]) & j!='start') {
        All.plus[is.na(All.plus[,j]),j] = 0
        Y.plus[is.na(Y.plus[,j]),j] = 0
        All[,j] = All.plus[,j] + Y.plus[,j]
      }
  }
  for (j in c('A','T','G','C','N'))
    All[,paste0(j,'_SB')] = All[,paste0(j,'_plus')]/All[,j]
  All = All[,c('chrpos','A','T','G','C','N',paste0(c('A','T','G','C','N'),'_SB'))]

  ok = (X$chrpos %in% pos_ind)
  not.ok = (X$chrpos %in% pos_snp)
  X = X[ok,] #### ensure to select positions in defined genomic ranges
  X = X[!not.ok,] #### ensure to remove SNP positions and background error rate for SNPs (E0) and INDELS (E0indel) in samples
  MAJ = MAJ[ok,]
  MAJ = MAJ[!not.ok,]
  All = All[ok,]
  All = All[!not.ok,]

  X.col = colnames(X)
  col.cov = grep('cov',X.col) # column indexes of the coverture
  col.del = grep('longdel($|[^_])',X.col) # column indexes of the longdel
  col.ins = grep('longins($|[^_])',X.col) # column indexes of the longins
  col.maj = grep('^maj',X.col) # column indexes of the major

  sum.no.na = function(x) sum(x,na.rm=TRUE)
  q95 = function(x) quantile(x,0.95,na.rm=TRUE)

  N0 = apply(X[,col.cov],1,sum.no.na) # Sum of the depths of all control samples for each position
  E0 = apply(X[,col.cov]-X[,col.maj],1,sum.no.na) # Sum of the SNV background counts of all control samples for each position

  X = X[which(!is.na(N0) | N0 > 5000),]  # Remove genomic positions which are not covered in all control samples or with sum of depths < 5,000x
  MAJ = MAJ[which(!is.na(N0) | N0 > 5000),]
  All = All[which(!is.na(N0) | N0 > 5000),]

  REF = apply(MAJ,1,function(x) names(which.max(table(as.character(x[-1])))))
  MAJ$the.ref = REF
  write.table(MAJ, file='MAJ.txt', row.names=FALSE, col.names=TRUE)
  write.table(All, file='All.txt', row.names=FALSE, col.names=TRUE)

  ### SNV rates
  p.sain = E0/N0 # Control SNV Position-Error Rate
  up.sain = apply((X[,col.cov]-X[,col.maj])/X[,col.cov],1,q95) # 95th percentile for Control SNV Position-Error Rate

  ### INDEL rates
  E0indel = apply(cbind(X[,col.del], X[,col.ins]),1,sum.no.na) # Sum of the INDEL background counts of all control samples for each position
  indel.p.sain = E0indel/N0 # Control INDEL Position-Error Rate
  indel.up.sain = apply((X[,col.del] + X[,col.ins]) / X[,col.cov], 1, q95) # 95th percentile for Control INDEL Position-Error Rate

  # Joint dataframe containing Control SNV and INDEL Position-Error Rate together with their 95th percentile, and the raw counts data
  chrpos = as.character(X$chrpos)
  background_error_rate = data.frame(cbind(chrpos,N0,E0,p.sain,up.sain,E0indel,indel.p.sain,indel.up.sain,REF))
  # background_error_rate = merge(background_error_rate,All,by='chrpos',all=TRUE) # Does not work

  # Precise what are the hotspot positions
  background_error_rate$hotspot = 'Non-hotspot'
  background_error_rate[which(background_error_rate$chrpos %in% hotspot$chrpos ),'hotspot'] = 'Hotspot'

  # Save Background error file...
  backround_error_file = paste0(output.dir,"background_error_rate.txt")
  print('Backround error rate saved in ...')
  print(paste0('... ',backround_error_file))
  write.table(background_error_rate, file=backround_error_file, row.names=FALSE, col.names=TRUE)

  return(length(ctrl.files))
}


#' function LoadBackgroundErrorRate
#'
#' This function will load the background error rates created from the controls using the function BuildCtrlErrorRate
#'
#' @param pos_ranges.file, char, name of the Rdata file containing the three variables \code{pos_ind}, \code{pos_snp}, \code{pos_ranges} as build by the function \code{PrepareLibrary}. Default NULL, use the position_ranges.rda provides that we used for our analysis.
#' @param ber.ctrl.file, char, pathname of the file providing the background error rates obtained from the controls (default NULL use the provided background error rates obtained from our 29 controls). See \code{background_error_rate.txt} data and \code{BuildCtrlErrorRate} function.
#' @param n.trim, integer, number of base positions trimmed at the ends of each amplicon (default 8)
#' @author N. Pécuchet, P. Laurent-Puig and Y. Rozenholc
#' @references \emph{Analysis of base-position error rate of next-generation sequencing to detect tumor mutations in circulating DNA} N. Pécuchet, Y. Rozenholc, E. Zonta, D. Pietraz, A. Didelot, P. Combe, L. Gibault, J-B. Bachet, V. Taly, E. Fabre, H. Blons, P. Laurent-Puig in \emph{Clinical Chemistry}
#'
#' @return the adapted background error rate
#'
LoadBackgroundErrorRate = function(pos_ranges.file,ber.ctrl.file,n.trim) {
  ### read background error rate
  background_error_rate = NULL
  if (is.null(ber.ctrl.file)) {
    data('background_error_rate',envir=environment()) # load default background error rate obtained from the 29 controls
  } else {
    if (!file.exists(ber.ctrl.file)) stop(paste('STOP:::File',ber.ctrl.file,'does not exist'))
    background_error_rate = read.table(ber.ctrl.file, header=T, sep=" ")
  }

  ### read position informations
  pos_ranges = pos_snp = pos_ind = NULL
  if (is.null(pos_ranges.file)) {
    data(positions_ranges,envir=environment())
  } else {
    if (!file.exists(pos_ranges.file)) stop(paste('STOP :::',pos_ranges.file,' does not exist'))
    load(pos_ranges.file)
  }

  if (n.trim>0) {
    pos_ind = NULL
    chrpos = rep(pos_ranges@seqnames@values,times=pos_ranges@seqnames@lengths)
    for(i in 1:length(chrpos)) {
      start = pos_ranges@ranges@start[i]
      end = start+pos_ranges@ranges@width[i]-1
      pos_ind = c(pos_ind,paste0(chrpos[i],':',(start+n.trim):(end-n.trim)))
    }
  }

  background_error_rate = subset(background_error_rate,chrpos %in% pos_ind)
  background_error_rate
}


#' function DetectPlasmaMutation
#'
#' This is the main function of the package that calls mutations by comparing at each genomic position the SNV or INDEL frequencies computed in one tested sample to
#' the SNV or INDEL Position-Error Rates computed from several control samples by a binomial test. An outlier detection is performed among all intra-sample p-values
#' to call a mutation.
#' For users wishing to develop their own analysis for other sequencing panel, it requires recalibrated BAM files control samples to be processed to compute the
#' Position-Error Rates stored in a file specified in \code{ber.ctrl.file}.
#'
#' @param patient.dir, char, foldername containing the rBAM folder of the patients. The typical folder hierarchy will consist of 'Plasma/rBAM'
#' @param patient.name, char, filename of the patient .bam file(s) (default NULL read all patients in folder \code{patient.dir})
#' @param pos_ranges.file, char, name of the Rdata file containing the three variables \code{pos_ind}, \code{pos_snp}, \code{pos_ranges} as build by the function \code{PrepareLibrary}. Default NULL, use the position_ranges.rda provides that we used for our analysis.
#' @param ber.ctrl.file, char, pathname of the file providing the background error rates obtained from the controls (default NULL use the provided background error rates obtained from our 29 controls). See \code{background_error_rate.txt} data and \code{BuildCtrlErrorRate} function.
#' @param bai.ext, char, filename extension of the bai files (default '.bai')
#' @param n.trim, integer, number of base positions trimmed at the ends of each amplicon (default 8)
#' @param cov.min, integer, minimal coverture required at each position (default 0)
#' @param force, boolean, (default FALSE) if TRUE force all computations to all files including already processed ones
#' @param show.more, boolean, (default FALSE show only detected positions) if TRUE additional annotations on result plots are given for non-significant mutations
#' @param qcutoff.snv, numeric, proportion of kept base positions ranged by increasing 95th percentile SNV PER in control samples (default 0.95)
#' @param qcutoff.indel, numeric, proportion of kept base positions ranged by increasing 95th percentile INDEL PER in control samples (default 0.99)
#' @param cutoff.sb.ref, numeric, exclude reference positions without cutoff < strand bias < 1-cutoff (default 0.1) (see Supplementary Materials in References)
#' @param cutoff.sb.hotspot, numeric, exclude hotspot positions with Symmetric Odds Ratio test > cutoff (default 3.1) (see Supplementary Materials in References)
#' @param cutoff.sb.nonhotspot, numeric, exclude non-hotspot positions with Symmetric Odds Ratio test > cutoff (default 2.5) (see Supplementary Materials in References)
#' @param hotspot.indel, char, a vector containing the known positions of hotspot deletion/insertion defined as chrX:start:end (default 'chr7:55227950:55249171')
#' @param output.dir, char, name of the folder to save results  (default \code{patient.dir}).
#' @author N. Pécuchet, P. Laurent-Puig and Y. Rozenholc
#' @references \emph{Analysis of base-position error rate of next-generation sequencing to detect tumor mutations in circulating DNA} N. Pécuchet, Y. Rozenholc, E. Zonta, D. Pietraz, A. Didelot, P. Combe, L. Gibault, J-B. Bachet, V. Taly, E. Fabre, H. Blons, P. Laurent-Puig in \emph{Clinical Chemistry}
#'
#' @return the number of processed patients
#'
#' @export DetectPlasmaMutation
#'
#' @examples
#'      patient.dir=system.file("extdata","4test_only/case/",package="PlasmaMutationDetector")
#'      if (substr(patient.dir,nchar(patient.dir),nchar(patient.dir))!='/')
#'        patient.dir = paste0(patient.dir,'/') # TO RUN UNDER WINDOWS
#'      DetectPlasmaMutation(patient.dir,output.dir=paste0(tempdir(),'/'))
#'
#'
DetectPlasmaMutation = function(patient.dir='./',patient.name=NULL,pos_ranges.file=NULL,ber.ctrl.file=NULL,bai.ext='.bai',
                               n.trim=8,cov.min=0,force=FALSE,show.more=FALSE,
                               qcutoff.snv=0.95,qcutoff.indel=0.99,
                               cutoff.sb.ref=0.1,cutoff.sb.hotspot=3.1,cutoff.sb.nonhotspot=2.5,
                               hotspot.indel='chr7:55227950:55249171',output.dir=patient.dir) {

  ### read and adapt background error rate
  background_error_rate = LoadBackgroundErrorRate(pos_ranges.file,ber.ctrl.file,n.trim)

  ### Identify the so-called noisy SNV and INDEL genomic positions having background error rate higher than an upper quantile.
  ### Quantiles are defined by the cut-off qcutoff.snv and qcutoff.indel
  noisy.snv = background_error_rate[UpperPart(background_error_rate$up.sain, qcutoff.snv),'chrpos']
  noisy.indel = background_error_rate[UpperPart(background_error_rate$indel.up.sain,qcutoff.indel),'chrpos']

  ### extract counts from BAM files
  patient.files = MAF_from_BAM(study.dir=patient.dir,input.filenames=patient.name,bai.ext=bai.ext,
                               pos_ranges.file=pos_ranges.file,force=force,output.dir=output.dir)

  ### directory for the results
  patient.results = paste0(output.dir,'Results/')
  if (!dir.exists(patient.results)) dir.create(patient.results)

  for (i in 1:length(patient.files)) {

    print(paste('Detecting mutations from', patient.files[i]))

    patient.name = rev(strsplit(patient.files[i],'/')[[1]])[1]
    patient.pdf = gsub('_MAF.txt$','.pdf',paste0(patient.results,patient.name))
    patient.hit = gsub('_MAF.txt$','_infos.txt',paste0(patient.results,patient.name))
    patient.mut = gsub('_MAF.txt$','_MUT.txt',paste0(patient.results,patient.name))

    if ((!force & file.exists(patient.hit)) && (file.size(patient.hit)>0)) next() # already processed go to next

    ### Load patient file
    sample = read.table(patient.files[i], header=T, sep="") ### read sample counts
    sample = sample[which(!is.na(sample$MA)),] ### remove positions which haven't been read
    depth = mean(sample$cov,na.rm=T)

    ### Find colums of interest
    ATGC.ind = which(colnames(sample) %in% c('A','T','G','C'))
    ATGC = colnames(sample)[ATGC.ind]
    INDEL.ind = which(colnames(sample) %in% c('longdel','longins'))
    INDEL = colnames(sample)[INDEL.ind]

    ### merge control and sample counts
    samplenoise = merge(sample, background_error_rate, by="chrpos", all=FALSE, suffixes=c('','.b'))

    ### Compute p-values
    samplenoise$probSNV = apply(cbind(c2n(samplenoise$MA),c2n(samplenoise$cov),c2n(samplenoise$E0),c2n(samplenoise$N0)),1, pvalue) # for SNV
    samplenoise$probINDEL = apply(cbind(c2n(samplenoise$longINDEL),c2n(samplenoise$cov),c2n(samplenoise$E0indel),c2n(samplenoise$N0)),1, pvalue) # for INDEL

    ### local function made out of samplenoise
    FindColumn = function(i,cols.ind,col.ref,col.names=NULL){
      ind = which(samplenoise[i,cols.ind]==samplenoise[i,col.ref])[1]
      if (is.null(col.names)) {
        ind = cols.ind[ind]
      } else{
        ind = col.names[ind]
      }
      ind
    }

    ### At each position, start by using SNV reference as default values:
    samplenoise[,'type'] = 'SNV'
    samplenoise[,'hprob'] = samplenoise[,'probSNV']

    ### Identify the REFERENCE allele and the mutated allele:
    samplenoise$Baseref = sapply(1:nrow(samplenoise),FindColumn,ATGC.ind,'maj',ATGC)
    samplenoise$Basemut = sapply(1:nrow(samplenoise),FindColumn,ATGC.ind,'MA',ATGC)
    for (q in 1:nrow(samplenoise)) {
      # ### 1.7.0 : to deal with tumoral fraction higher than 50%
      if (as.character(samplenoise$Basemut[q])==as.character(samplenoise$REF[q])) {
        print(samplenoise[q,])
        print(paste('Inversion of reference at position',samplenoise$chrpos[q]))
        samplenoise$Basemut[q] = as.character(samplenoise$Baseref[q])
        samplenoise$Baseref[q] = as.character(samplenoise$REF[q])
        samplenoise$hprob[q] = samplenoise$probSNV[q] = pvalue(c2n(samplenoise$maj[q]),c2n(samplenoise$cov[q]),c2n(samplenoise$E0[q]),c2n(samplenoise$N0[q]))
      }
    }

    ### Correct the INDEL that is the non SNV
    is.INDEL = (samplenoise$probINDEL < samplenoise$probSNV)
    ### Decipher the most significant alteration between SNV, DELETION>2bp or INSERTION>2bp
    is.longins = (samplenoise$longins > samplenoise$longdel)
    samplenoise[which(is.INDEL & is.longins),'type'] = "insertion"
    samplenoise[which(is.INDEL & !is.longins),'type'] = "deletion"
    samplenoise[which(is.INDEL),'hprob'] = samplenoise[which(is.INDEL),'probINDEL']

    ### At each position, compute Strand Bias scores for ref, major variant and indel
    for (q in 1:nrow(samplenoise)) {
      ### compute reference strand bias
      # print(paste('q =',q))
      # print(paste0(samplenoise[q,'Baseref'],"_SB"))
      # print(samplenoise[q,])
      samplenoise[q,'SBref'] = samplenoise[q,paste0(samplenoise[q,'Baseref'],"_SB")]
      refplus = samplenoise[q,paste0(samplenoise[q,'Baseref'],'_plus')]
      refmin = samplenoise[q,paste0(samplenoise[q,'Baseref'],'_minus')]

      ### compute strand bias scores ...
      ### ... for major alteration
      if (samplenoise[q,'type']=='SNV') {
        altplus = samplenoise[q,paste0(samplenoise[q,'Basemut'],'_plus')]
        altmin = samplenoise[q,paste0(samplenoise[q,'Basemut'],'_minus')]
        samplenoise[q,'SBscoreSNV'] = sbscore(refplus, refmin, altplus, altmin)
        samplenoise[q,'hSB'] = samplenoise[q,paste0(samplenoise[q,'Basemut'],"_SB")] # WAS samplenoise[q,'SBsnv']
      }

      ### ... for INDEL
      if (samplenoise[q,'type']!='SNV') {
        altplus = samplenoise[q,paste0(samplenoise[q,'type'],'_plus')]
        altmin = samplenoise[q,paste0(samplenoise[q,'type'],'_minus')]
        samplenoise[q,'SBscoreINDEL'] = sbscore(refplus, refmin, altplus, altmin)
        samplenoise[q,'hSB'] = samplenoise[q,paste0(FindColumn(q,INDEL.ind,'longINDEL',INDEL),"_SB")] # WAS samplenoise[q,'SBindel']
      }

    }

    ### Correction for graphical outputs
    ### Correct the insertions
    samplenoise[which(samplenoise$type=='insertion'),'Baseref'] = 'INS'
    samplenoise[which(samplenoise$type=='insertion'),'Basemut'] = '>2N'

    ### Correct the deletions: length and strand biais
    ### Need to gather consecutive deletions>2bp (sharing the same contig number) into one unique large deletion table
    ind.deletion = which(samplenoise$type=='deletion')
    if (length(ind.deletion)>0) {
      deletion = samplenoise[ind.deletion,]
      for (z in unique(deletion$contig)) {
        deletion$Baseref = 'DEL'
        bool = (deletion$contig==z)
        ind = which(bool)
        if (length(ind)>0) {
          deletion[ind,'Basemut'] = c2n(max(deletion[ind,][[3]])) + 1 - c2n(min(deletion[ind,][[3]])) # gives the number of deleted bp
          deletion[ind,'hprob'] = min(deletion[ind,'probINDEL']) # retains the most significant p-value (p.INDEL) on the large deletion
          deletion[ind,'SBscoreINDEL'] = min(deletion[ind,'SBscoreINDEL']) # retains the lowest Symmetric Odds Ratio Test on the large deletion
          deletion[which(bool & deletion$chrpos %in% noisy.indel),'hprob'] = NA
        }
      }
      deletion = deletion[which(deletion$Basemut!=1),] #ensure that no deletion is <3bp
      samplenoise = rbind(samplenoise[-which(samplenoise$type=='deletion'),], deletion[!duplicated(deletion$contig),])
    }

    ### Remove positions with reference showing outlier strand bias
    samplenoise[which(samplenoise$SBref > 1-cutoff.sb.ref | samplenoise$SBref < cutoff.sb.ref), 'hprob'] = NA

    ### Remove positions showing too high strand bias score (Symmetric Odds Ratio Test)
    samplenoise[which(samplenoise$SBscoreINDEL > cutoff.sb.hotspot), 'hprob'] = NA # for INDEL
    samplenoise[which((samplenoise$hotspot=="Hotspot") & (samplenoise$SBscoreSNV > cutoff.sb.hotspot)), 'hprob'] = NA # for SNV in hotspot
    samplenoise[which((samplenoise$hotspot=="Non-hotspot") & (samplenoise$SBscoreSNV > cutoff.sb.nonhotspot)), 'hprob'] = NA # for SNV in non-hotspot

    ### Remove positions showing 5% highest background error rate in SNV or in INDEL
    type.snv = (samplenoise$type == 'SNV')
    samplenoise[which(type.snv & samplenoise$chrpos %in% noisy.snv ), 'hprob'] = NA
    samplenoise[which(!type.snv & samplenoise$chrpos %in% noisy.indel), 'hprob'] = NA


    ### keep non removed positions (hprob not NA)
    samplenoise = samplenoise[-which(is.na(samplenoise$hprob)),]

    ### compute the frequency of the mutation
    type.snv = (samplenoise$type == 'SNV')
    ind = which(type.snv)
    samplenoise[ind,'Freq'] = samplenoise[ind,'MA']/samplenoise[ind,'cov'] # compute the frequency of the mutation
    ind = which(!type.snv)
    samplenoise[ind,'Freq'] = samplenoise[ind,'longINDEL']/samplenoise[ind,'cov'] # compute the frequency of the mutation

    ### First outlier detection
    samplenoise$outlier="no"
    samplenoise[which(samplenoise$chrpos %in% find.outliers(samplenoise$hprob, samplenoise$chrpos)),'outlier']="outlier"  ##### <<<< PB
    nb.outliers=length(samplenoise$outlier[which(samplenoise$outlier!="no")])

    ### Second, we deal now with Hotspots on the smallest p-values
    # find the 250 smallest p-values
    ind = order(samplenoise$hprob, decreasing=FALSE)[1:min(250,nrow(samplenoise))]
    small.pval = samplenoise[ind,'hprob']
    log.small.pval = minus.logit(small.pval)

    #
    # find the upper outliers of a robust boxplot of the finite logit p-values
    upper.wisker = robustbase::adjbox(log.small.pval[is.finite(log.small.pval)], plot=F)$stats[5,1]
    uppers = ind[log.small.pval > upper.wisker]
    for (q in uppers) {
      was.outlier = (samplenoise[q,'outlier']!='no')
      if (samplenoise$type[q]=='SNV' & samplenoise$hotspot[q]=='Hotspot') samplenoise[q,'outlier']="outlier_hotspot"
      if (samplenoise$type[q]!='SNV')
        for (pos in hotspot.indel) { # check if the position is a known indel hotspot positions
          tmp = regmatches(pos,gregexpr(':',pos),invert=T)[[1]]
          if (samplenoise$seqnames[q]==tmp[1] & c2n(samplenoise$start[q])>=c2n(tmp[2]) & c2n(samplenoise$start[q])<=c2n(tmp[3]))
            samplenoise[q,'outlier']="outlier_hotspot"
        }
      if (!was.outlier & samplenoise[q,'outlier']=="outlier_hotspot") nb.outliers=nb.outliers+1
    }

    write.table(samplenoise, file=patient.hit, quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE)
    mutations = samplenoise[which(samplenoise$outlier!='no'),c('chrpos','type','Baseref','Basemut','Freq','hotspot','hprob')]
    colnames(mutations) = c('Genomic pos','Mutation type','Base ref','Base mut','Allelic freq','hotspot','p-value')
    write.table(mutations, file=patient.mut, quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE)

    ### For plotting purpose: all possible mutation types are added for legend consistency
    rownames(samplenoise) = 1:nrow(samplenoise)
    samplenoise[nrow(samplenoise)+1,c('hotspot','outlier','type')]=c('Hotspot','outlier','deletion')
    samplenoise[nrow(samplenoise)+1,c('hotspot','outlier','type')]=c('Non-hotspot','no','insertion')
    samplenoise[nrow(samplenoise)+1,c('hotspot','outlier','type')]=c('Non-hotspot','outlier_hotspot','insertion')

    ### For plotting pupose: make type and outlier as.factor with ordered levels
    USNT = unique(samplenoise$type) # YVES 5/09/2016
    samplenoise$type = factor(samplenoise$type, levels = USNT[order(USNT, decreasing=TRUE)]) # YVES 5/09/2016
    USNO = unique(samplenoise$outlier) # YVES 5/09/2016
    samplenoise$outlier = factor(samplenoise$outlier, levels = USNO[order(USNO)]) # YVES 5/09/2016

    ### Plot generation using ggplot2
    grob = grobTree(textGrob(paste0("Mean Depth of Coverage: ", floor(depth),"x"),
                             x=0.45,  y=0.97, hjust=0, gp=gpar(col="red", fontsize=9, fontface="italic")))

    show.pos = (samplenoise$outlier=="outlier_hotspot" | samplenoise$outlier=="outlier")
    if (show.more)
      show.pos = show.pos | ((samplenoise$hotspot=="Hotspot" | samplenoise$Baseref=="DEL" | samplenoise$Baseref=="INS") & samplenoise$hprob<0.000001 & samplenoise$hSB <= 1-cutoff.sb.ref & samplenoise$hSB >= cutoff.sb.ref)

    ind = rev(gregexpr('/',patient.name)[[1]])[1]
    Freq4plot = substr(paste(samplenoise$Freq), 0, 6) # keep 3 decimal for the frequency of the mutation
    pdf = ggplot(samplenoise, aes(sapply(samplenoise$hprob, function(x) -log(max(1e-300,x))), samplenoise$hSB)) +
      geom_point(aes(colour=samplenoise$hotspot, size=samplenoise$outlier, shape=samplenoise$type), na.rm=T) +
      geom_text(aes(label=ifelse(show.pos,
            paste0(as.character(samplenoise$chrpos),".",samplenoise$Baseref,">",samplenoise$Basemut, " AF ", Freq4plot),
            '')),hjust=0, vjust=0.5, angle=65, size=2.5, na.rm=T)  +
      ylab("Strand Biais") + xlab(paste0("-log(p-value)")) + theme_set(theme_bw(base_size = 10)) +
      labs(title = paste(nb.outliers,"Outlier(s)", "\n", substr(patient.name, ind+1, ind+26))) + scale_y_continuous(limits=c(0, 1.0))  +
      scale_x_continuous(limits=c(0, 10e2) ) + annotation_custom(grob) + scale_shape_manual(name="Class", values = c(16, 25, 17)) +
      scale_colour_brewer(name="Hotspot", palette="Set1") + scale_size_manual(name="Outlier", values=c(0.8,5,3))

    # Export results plot and table
    ggsave(pdf, filename=patient.pdf, width = 10, height = 10)

  } # end of loop over patients

  length(patient.files)

}


#' The package provide the SNV and INDEL PERs computed for the Ion AmpliSeq™ Colon and Lung Cancer Panel v2 from 29 controls in a table available in the data file \code{background_error_rate.txt}.
#'
#' This table contains 9 variables for each genomic position
#' \itemize{
#'   \item \code{chrpos}, char, of the form chrN:XXXXXXXXX defining genomic position
#'   \item \code{N0}, integer, the coverture in the controls
#'   \item \code{E0}, integer, the number of errors in the controls
#'   \item \code{p.sain}, numeric,  the ratio E0/N0
#'   \item \code{up.sain}, numeric, the 95th quantile of the Binomial with parameter N0 and E0/N0
#'   \item \code{E0indel}, integer, the amount of indel
#'   \item \code{indel.p.sain}, numeric, the ration E0indel/N0
#'   \item \code{indel.up.sain}, numeric, the 95th quantile of the Binomial with parameter N0 and E0indel/N0
#'   \item \code{hotspot}, char, either 'Non-hotspot' or 'Hotspot' depending if the genomic position is known as hotspot or not.
#' }
#
#' @docType data
#' @usage data(background_error_rate)
#' @name background_error_rate
#' @aliases background_error_rate.txt
#' @author N. Pécuchet, P. Laurent-Puig and Y. Rozenholc
#' @keywords data
#' @seealso \code{BuildCtrlErrorRate}
#' @references \emph{Analysis of base-position error rate of next-generation sequencing to detect tumor mutations in circulating DNA} N. Pécuchet, Y. Rozenholc, E. Zonta, D. Pietraz, A. Didelot, P. Combe, L. Gibault, J-B. Bachet, V. Taly, E. Fabre, H. Blons, P. Laurent-Puig in \emph{Clinical Chemistry}
NULL

#' The package provide the positions and ranges computed for the Ion AmpliSeq™ Colon and Lung Cancer Panel v2 as a Rdata file \code{positions_ranges.rda}.
#'
#' This file contains 4 variables
#' \itemize{
#'   \item \code{pos_ind}, vector of chars, of the form chrN:XXXXXXXXX defining genomic positions of the Ion AmpliSeq™ Colon and Lung Cancer Panel v2
#'   \item \code{pos_snp}, vector of chars, of the form chrN:XXXXXXXXX defining the known snp genomic positions
#'   \item \code{pos_ranges}, GRanges object, describing the 92 amplicons of the Ion AmpliSeq™ Colon and Lung Cancer Panel v2
#' }
#
#' @docType data
#' @usage data(positions_ranges)
#' @name positions_ranges
#' @aliases positions_ranges.rda pos_ind pos_snp pos_ranges
#' @author N. Pécuchet, P. Laurent-Puig and Y. Rozenholc
#' @keywords data
#' @seealso \code{Prepare_Library}
#' @references \emph{Analysis of base-position error rate of next-generation sequencing to detect tumor mutations in circulating DNA} N. Pécuchet, Y. Rozenholc, E. Zonta, D. Pietraz, A. Didelot, P. Combe, L. Gibault, J-B. Bachet, V. Taly, E. Fabre, H. Blons, P. Laurent-Puig in \emph{Clinical Chemistry}
NULL

#' The package provide a list of known hotspot positions located on the amplicons of the Ion AmpliSeq™ Colon and Lung Cancer Panel v2 as a txt file \code{hotspot.txt} which contains a vector/variable ---named chrpos (first row)--- of chars, of the form chrN:XXXXXXXXX defining genomic positions.
#'
#
#' @docType data
#' @usage data(hotspot)
#' @name hotspot
#' @aliases hotspot.txt
#' @author N. Pécuchet, P. Laurent-Puig and Y. Rozenholc
#' @keywords data
#' @references \emph{Analysis of base-position error rate of next-generation sequencing to detect tumor mutations in circulating DNA} N. Pécuchet, Y. Rozenholc, E. Zonta, D. Pietraz, A. Didelot, P. Combe, L. Gibault, J-B. Bachet, V. Taly, E. Fabre, H. Blons, P. Laurent-Puig in \emph{Clinical Chemistry}
NULL
Any scripts or data that you put into this service are public.
PlasmaMutationDetector documentation built on May 2, 2019, 10:10 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
PlasmaMutationDetector
Tumor Mutation Detection in Plasma

R/functions.R
In PlasmaMutationDetector: Tumor Mutation Detection in Plasma

Defines functions c2n sbscore pvalue minus.logit find.outliers UpperPart pileupFreq PrepareLibrary MAF_from_BAM BuildCtrlErrorRate LoadBackgroundErrorRate DetectPlasmaMutation

Documented in BuildCtrlErrorRate DetectPlasmaMutation LoadBackgroundErrorRate MAF_from_BAM PrepareLibrary

Try the PlasmaMutationDetector package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PlasmaMutationDetector Tumor Mutation Detection in Plasma

R/functions.R In PlasmaMutationDetector: Tumor Mutation Detection in Plasma

Defines functions c2n sbscore pvalue minus.logit find.outliers UpperPart pileupFreq PrepareLibrary MAF_from_BAM BuildCtrlErrorRate LoadBackgroundErrorRate DetectPlasmaMutation

Documented in BuildCtrlErrorRate DetectPlasmaMutation LoadBackgroundErrorRate MAF_from_BAM PrepareLibrary

Try the PlasmaMutationDetector package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PlasmaMutationDetector
Tumor Mutation Detection in Plasma

R/functions.R
In PlasmaMutationDetector: Tumor Mutation Detection in Plasma