R/wes_process.R

Defines functions genome.build.finder loc.nt.gcc.hs.multi loc.nt.gcc.hs loc.nt.count.hs bedBinner WES.Normalize.ff.Batch WES.Normalize.ff WES.Normalize WES.Bin.Batch WES.Bin BINpack.Maker

Documented in BINpack.Maker WES.Bin WES.Bin.Batch WES.Normalize WES.Normalize.ff WES.Normalize.ff.Batch

## Generates a BINpack (bins and pre-computed GC tracks) from a capture bed
BINpack.Maker <- function(bed.file = NULL, bin.size = 50, genome.pkg = "BSgenome.Hsapiens.UCSC.hg19", extend.multi = c(0, 50, 100, 200, 400, 800, 1600, 3200, 6400), blocksize = 1E+04, nthread = 1, out.dir = getwd(), return.data = FALSE) {

  # setwd("/mnt/data_cigogne/job/PUBLI_EaCoN/TCGA/RESOURCES/test/")
  # bed.file <- "SureSelect_ClinicalResearchExome.padded_GRCh37-lite_merged_sorted.bed"
  # # bed.file = "test1.bed"
  # bin.size = 50
  # genome.pkg = "BSgenome.Hsapiens.1000genomes.hs37d5"
  # extend.multi = c(0, 100, 500, 1000, 2000, 4000)
  # blocksize = 1E+04
  # nthread = 5
  # out.dir = getwd()
  # return.data = FALSE
  # source("~/git_gustaveroussy/EaCoN/R/BED_functions.R")
  # source("~/git_gustaveroussy/EaCoN/R/wes_process.R")
  # source("~/git_gustaveroussy/EaCoN/R/mini_functions.R")
  
    ## Checks
  if (is.null(bed.file)) stop("A BED file is required !", call. = FALSE)
  if (!file.exists(bed.file)) stop("Could not find the BED file !", call. = FALSE)
  if (is.null(out.dir)) message("NOTE : Checked / cleaned bed will be written in the same directory as source.") else {
    if (!file.exists(out.dir)) stop("Could not find the output directory !", call. = FALSE)
    if (!file.info(out.dir)$isdir) stop("out.dir is not a directory !", call. = FALSE)
  }
  if (is.null(genome.pkg)) stop(tmsg("A BSgenome package name is required !"), call. = FALSE)
  if (!genome.pkg %in% BSgenome::installed.genomes()) {
    if (genome.pkg %in% BSgenome::available.genomes()) {
      stop(tmsg(paste0("BSgenome ", genome.pkg, " available but not installed. Please install it !")), call. = FALSE)
    } else {
      stop(tmsg(paste0("BSgenome ", genome.pkg, " not available in valid BSgenomes and not installed ... Please check your genome name or install your custom BSgenome !")), call. = FALSE)
    }
  }
  
  bin.size <- as.integer(bin.size)
  extend.multi <- as.integer(extend.multi)
  
  ### Loading genome
  message(paste0("Loading ", genome.pkg, " ..."))
  suppressPackageStartupMessages(require(genome.pkg, character.only = TRUE))
  BSg.obj <- getExportedValue(genome.pkg, genome.pkg)
  # genome <- BSgenome::providerVersion(BSg.obj)
  genome <- metadata(BSg.obj)$genome
  organism <- BSgenome::organism(BSg.obj)


  ## Cleaning BED
  bed.clean <- BedCheck(bed.file = bed.file, genome.pkg = genome.pkg, out.dir = out.dir, return.data = TRUE)

  ## Binning
  message(paste0("Performing binning (", bin.size, ") ..."))
  bed.binned <- bedBinner(bed = bed.clean, bin.size = bin.size, nthread = nthread)

  bed.binned <- data.frame(ProbeSetName = seq_len(nrow(bed.binned)), bed.binned, stringsAsFactors = TRUE)

  message("Generating GC% tracks ...")
  wes.gc <- loc.nt.gcc.hs.multi(loc.df = bed.binned, genome.pkg = genome.pkg, extend.multi = extend.multi, blocksize = blocksize, nthread = nthread)

  rm(bed.binned)
  
  
  wes.meta.key <- c("genome-species", "genome-version", "genome-package", "array_type", "track_type", "bin_size")
  wes.meta.value <- c(organism, genome, genome.pkg, "WES", "GC", bin.size)
  # renorm.data <- list(tracks = wes.gc, info = list("genome-version" = genome, "genome-package" = genome.pkg, bin.size = bin.size, track.type = "GC"), bed.clean = bed.clean, bed.binned = bed.binned)
  renorm.data <- list(tracks = wes.gc, info = data.frame(key = wes.meta.key, value = wes.meta.value, stringsAsFactors = FALSE), bed.clean = bed.clean)
  rm(wes.gc, wes.meta.value, wes.meta.key, bed.clean)
  
  save("renorm.data", file = paste0(out.dir, "/", sub(pattern = "\\.bed$", replacement = paste0("_", genome, "_b", bin.size, ".GC.rda"), x = basename(bed.file), ignore.case = TRUE)), compress = "xz")

  message("Done.")
  if (return.data) return(renorm.data)
}


WES.Bin <- function(testBAM = NULL, refBAM = NULL, BINpack = NULL, samplename = "SAMPLE", Q = 20, nsubthread = 1, cluster.type = "PSOCK", out.dir = getwd(), return.data  = FALSE, write.data = TRUE, plot = TRUE, force = FALSE) {
  
  # setwd("/home/job/WORKSPACE/EaCoN_tests/WES/AlexandreLefranc/Results")
  # # setwd("/home/job/WORKSPACE/EaCoN_tests")
  # testBAM <- "/home/job/WORKSPACE/EaCoN_tests/WES/AlexandreLefranc/DATA/Sample_PHEO_AG_HS_048_DNA.reord.sorted.dedup.recal.reheaded.bam"
  # refBAM <- "/home/job/WORKSPACE/EaCoN_tests/WES/AlexandreLefranc/DATA/Sample_PHEO_AG_HS_048G_DNA.reord.sorted.dedup.recal.reheaded.bam"
  # # # # BINpack <- "/mnt/data_cigogne/job/PUBLI_EaCoN/MATCHR/RESOURCES/SureSelect_ClinicalResearchExome.padded_hg19_b50.rda"
  # # # BINpack <- "/mnt/data_cigogne/job/PUBLI_EaCoN/TCGA/RESOURCES/SureSelect_ClinicalResearchExome.padded_hs37d5_b50.GC.rda"
  # BINpack <- "/home/job/WORKSPACE/EaCoN_tests/WES/AlexandreLefranc/Results/V4-UTRs.hg38.fragment_targets_minimal_sorted_longChr_hg38_b50.GC.rda"
  # samplename <- "ALEX1"
  # Q <- 20
  # out.dir = getwd()
  # nsubthread = 4
  # cluster.type = "PSOCK"
  # return.data = FALSE
  # write.data = TRUE
  # plot = TRUE
  # source("/home/job/git_gustaveroussy/EaCoN/R/mini_functions.R")
  # source("/home/job/git_gustaveroussy/EaCoN/R/BED_functions.R")
  # source("/home/job/git_gustaveroussy/EaCoN/R/wes_process.R")
  # suppressPackageStartupMessages(require(foreach))
  # require(magrittr)
  
  ## CHECKS (files/parameters)
  if (is.null(BINpack)) stop(tmsg("A BINpack file is required !"), call. = FALSE)
  if (!file.exists(BINpack)) stop(tmsg("Could not find the BINpack file !"), call. = FALSE)
  if (is.null(refBAM)) stop(tmsg("A reference BAM file is required !"), call. = FALSE)
  if (!file.exists(refBAM)) stop(tmsg("Could not find the refBAM file !"), call. = FALSE)
  if (is.null(testBAM)) stop(tmsg("A test BAM file is required !"), call. = FALSE)
  if (!file.exists(testBAM)) stop(tmsg("Could not find the testBAM file !"), call. = FALSE)
  if (!is.numeric(Q)) stop(tmsg("Q must be numeric !"), call. = FALSE)
  if (is.null(out.dir)) stop(tmsg("An output directory is required !"), call. = FALSE)
  if (!file.exists(out.dir)) stop(tmsg("Could not find the output directory !"), call. = FALSE)
  if (!file.info(out.dir)$isdir) stop(tmsg("out.dir is not a directory"), call. = FALSE)
  if (!return.data & !write.data) stop(tmsg("Data should be returned and/or written on disk, but not none !"), call. = FALSE)
  if (Q < 0) stop(tmsg("Q should be positive !"), call. = FALSE)
  
  ## WARNINGS
  if (return.data) tmsg("Data will be returned.")
  if (write.data) tmsg("Data will be written on disk.")
  if (!plot) tmsg("No plot will get drawn.")
  
  
  ## Loading binpack
  tmsg("Loading BINpack ...")
  load(BINpack)
  
  ## CHECKS (genome)
  # genome.pkg <- GC.data$info$genome.pkg
  genome.pkg <- renorm.data$info$value[renorm.data$info$key == "genome-package"]
  if (!genome.pkg %in% BSgenome::installed.genomes()) {
    if (genome.pkg %in% BSgenome::available.genomes()) {
      stop(tmsg(paste0("BSgenome ", genome.pkg, " available but not installed. Please install it !")), call. = FALSE)
    } else {
      stop(tmsg(paste0("BSgenome ", genome.pkg, " not available in valid BSgenomes and not installed ... Please check your genome name or install your custom BSgenome !")), call. = FALSE)
    }
  }
  if (dir.exists(samplename)) { if (!force) stop(tmsg(paste0("A [", samplename, '] dir already exists !')), call. = FALSE) else unlink(samplename, recursive = TRUE, force = FALSE) }
  
  ### Loading genome
  tmsg(paste0("Loading ", genome.pkg, " ..."))
  suppressPackageStartupMessages(require(genome.pkg, character.only = TRUE))
  BSg.obj <- getExportedValue(genome.pkg, genome.pkg)
  # genome <- BSgenome::providerVersion(BSg.obj)
  genome <- metadata(BSg.obj)$genome
  ## Files controls
  tmsg("Checking BINpack and BAMs compatibility ...")
  
  ## Inspecting BAM headers
  refBAM.h <- Rsamtools::scanBamHeader(refBAM)
  testBAM.h <- Rsamtools::scanBamHeader(testBAM)
  bed.data <- renorm.data$tracks[,1:4]
  renorm.data$tracks <- NULL
  gc()
  
  if (!all(names(refBAM.h[[1]]$targets) %in% names(testBAM.h[[1]]$targets))) stop(tmsg("Reference BAM and Test BAM are not compatible (different chr names) !"), call. = FALSE)
  if (!all(bed.data$chr %in% names(refBAM.h[[1]]$targets))) stop(tmsg("Reference BAM and BED are not compatible (different chr names) !"), call. = FALSE)
  if (!all(unique(bed.data$chr) %in% names(testBAM.h[[1]]$targets))) stop(tmsg("Test BAM and BED are not compatible (different chr names) !"), call. = FALSE)
  if (!all(unique(bed.data$chr) %in% BSgenome::seqnames(BSg.obj))) stop(tmsg("BED and BSgenome are not compatible (different chr names) !"), call. = FALSE)
  
  ## Identifying the platform
  testBAM.h.unl <- unlist(testBAM.h)
  manuf.hentry <- grep(pattern = "^PL:", testBAM.h.unl)[1]
  manufacturer <- if(!is.na(manuf.hentry)) sub(pattern = "^PL:", replacement = "", testBAM.h.unl[[manuf.hentry]]) else "NA"
  
  meta.b <- list(
    samplename = samplename,
    source = "WES",
    source.file = list(refBAM = refBAM, testBAM = testBAM, BINpack = BINpack),
    type = "WES",
    manufacturer = manufacturer,
    species = GenomeInfoDb::organism(BSg.obj),
    genome = genome,
    genome.pkg = genome.pkg,
    predicted.gender = "NA"
  )
  
  meta.w <- list(
    testBAM.header = paste0(testBAM.h, collapse = " "),
    refBAM.header = paste0(refBAM.h, collapse = " "),
    samtools.Q = Q,
    # bin.size = GC.data$info$bin.size
    bin.size = as.numeric(renorm.data$info$value[renorm.data$info$key == "bin_size"])
  )
  rm(testBAM.h, refBAM.h)
  
  ### Common BAM flags
  param.FLAG <- Rsamtools::scanBamFlag(isSecondaryAlignment=FALSE, isNotPassingQualityControls=FALSE, isDuplicate=FALSE)
  param.PILEUP <- Rsamtools::PileupParam(distinguish_strands = FALSE, max_depth = 5E+04, min_base_quality = Q, min_nucleotide_depth = 0, distinguish_nucleotides = TRUE)
  
  bedbam2pup <- function(BamFile = NULL, bed.data = NULL, scanBamFlag = NULL, pileupParam = NULL) {
    
    ### Making pileup
    tmsg("   Getting pileup ...")
    param.BAM <- Rsamtools::ScanBamParam(which = GenomicRanges::makeGRangesFromDataFrame(bed.data, seqnames.field = "chr"), flag = scanBamFlag)
    pres <- dplyr::as.tbl(Rsamtools::pileup(BamFile, scanBamParam = param.BAM, pileupParam = pileupParam))
    colnames(pres)[c(1,5)] <- c("chr", "bin")
    levels(pres$bin) <- bed.data$ProbeSetName
    pres$bin <- as.integer(as.character(pres$bin))
    
    # levels(pres$bin) <- rownames(bed.data)
    
    gc()
    
    if("=" %in% unique(pres$nucleotide)) tmsg("   Bam contains the '=' sign !")
    
    ### Building genomic sequence of reference block
    tmsg("   Getting reference genome sequence ...")
    refblock <- pres[!duplicated(pres$pos), c(1,2)]
    pres <- dplyr::group_by(pres, pos)
    refblock$tot_count <- dplyr::summarize(pres, tot_count = sum(count))$tot_count
    pres <- dplyr::ungroup(pres)
    refblock$nucleotide <- as.factor(BSgenome::getSeq(BSg.obj, names = GenomicRanges::makeGRangesFromDataFrame(refblock, start.field = "pos", end.field = "pos"), as.character = TRUE))
    
    ### Merging blocks
    tmsg("   Computing alt counts ...")
    # refblock <- dplyr::group_by(refblock, pos, nucleotide)
    # pres <- dplyr::group_by(pres, pos, nucleotide)
    merged <- suppressWarnings(dplyr::left_join(refblock, pres, by = c("chr", "pos", "nucleotide")))
    rm(pres, refblock)
    gc()
    bed.based <- dplyr::as.tbl(data.frame(chr = unique(bed.data$chr), pos = as.integer(unlist(seq.int2(from = bed.data$start, to = bed.data$end, by = 1))), bin = rep(bed.data$ProbeSetName, times = (bed.data$end - bed.data$start +1))))
    bed.joint <- suppressWarnings(dplyr::left_join(bed.based, merged, c("chr", "pos", "bin"))) ### yeah !
    rm(bed.based)
    gc()
    ### Cleaning and formatting
    bed.joint$nucleotide <- NULL
    bed.joint$chr <- as.factor(bed.joint$chr)
    # bed.joint$which_label <- as.factor(bed.joint$which_label)
    
    bed.joint$count <- bed.joint$tot_count - bed.joint$count
    colnames(bed.joint) <- c("chr", "pos", "bin", "tot_count", "alt_count")
    
    bed.joint$tot_count[is.na(bed.joint$tot_count)] <- 0L
    bed.joint$alt_count[is.na(bed.joint$alt_count)] <- 0L
    # bed.joint$nt[is.na(bed.joint$nt)] <- bed.based$nt[is.na(bed.joint$nt)]

    return(bed.joint)
  }
  
  pileup.go <- function(testBAM = NULL, refBAM = NULL, bed.data = NULL, scanBamFlag = NULL, pileupParam = NULL, nsubthread = 1, cluster.type = "PSOCK") {
    
    # BamFile <- testBAM
    # scanBamFlag <- param.FLAG
    # pileupParam <- param.PILEUP
    # nsubthread <- 1
    # cluster.type = "PSOCK"

    #### Indexing BAM if needed
    if (!(file.exists(paste0(testBAM, ".bai")) || file.exists(sub(pattern = "\\.bam", replacement = ".bai", x = testBAM, ignore.case = TRUE)))) {
      tmsg("Indexing Test BAM ...")
      Rsamtools::indexBam(testBAM) 
    } else tmsg("Test BAM is already indexed.")
    if (!(file.exists(paste0(refBAM, ".bai")) || file.exists(sub(pattern = "\\.bam", replacement = ".bai", x = refBAM, ignore.case = TRUE)))) {
      tmsg("Indexing Ref BAM ...")
      Rsamtools::indexBam(refBAM)
    } else tmsg("Ref BAM is already indexed.")
    
    #### Launching cluster
    if (length(unique(bed.data$chr)) < nsubthread) nsubthread <- length(unique(bed.data$chr))
    cl <- parallel::makeCluster(spec = nsubthread, type = cluster.type, outfile = "")
    doParallel::registerDoParallel(cl)
    k <- 0
    BAMcounts <- foreach::foreach(k = unique(as.character(bed.data$chr)), .inorder = TRUE, .export = c("tmsg", "BSg.obj", "bedbam2pup", "seq.int2")) %dopar% {
      
      tmsg(paste0(" Sequence : ", k))

      ### Computing counts for both BAMs
      bed.data.k <- bed.data[bed.data$chr == k,]
      
      ## Opening BAM connections
      openBAM <- Rsamtools::BamFileList(c(testBAM, refBAM))
      
      tmsg("  Computing TEST counts ...")
      testPUP <- bedbam2pup(BamFile = openBAM[[1]], bed.data = bed.data.k, scanBamFlag = scanBamFlag, pileupParam = pileupParam)
      tmsg("  Computing REF counts ...")
      refPUP <- bedbam2pup(BamFile = openBAM[[2]], bed.data = bed.data.k, scanBamFlag = scanBamFlag, pileupParam = pileupParam)
      
      ### Merge counts
      tmsg("  Merging TEST and REF ...")
      mPUP <- dplyr::inner_join(testPUP, refPUP, c("chr", "pos", "bin"))
      rm(testPUP, refPUP)
      gc()
      mPUP <- dplyr::group_by(mPUP, bin)
      
      ### Adding missing nucleotides
      # mPUP$nt <- as.factor(unlist(strsplit(as.character(BSgenome::getSeq(BSg.obj, names = GenomicRanges::makeGRangesFromDataFrame(mPUP, seqnames.field = "chr", start.field = "pos", end.field = "pos"))), split = "")))

      ### Binning
      tmsg("  Binning ...")
      # CN.table <- dplyr::summarize(mPUP, chr = unique(chr), start = as.integer(min(pos)), end = as.integer(max(pos)), tot_count.test = as.integer(round(mean(tot_count.x))), tot_count.ref = as.integer(round(mean(tot_count.y))), GCPC = as.integer(round(seqinr::GC(as.character(nt))*100)))
      CN.table <- dplyr::summarize(mPUP, chr = unique(chr), start = as.integer(min(pos)), end = as.integer(max(pos)), tot_count.test = as.integer(round(mean(tot_count.x))), tot_count.ref = as.integer(round(mean(tot_count.y))))
      mPUP <- dplyr::ungroup(mPUP)
      SNP.table <- mPUP[mPUP$alt_count.x > 0 | mPUP$alt_count.y > 0,]
      
      
      ### Cleaning
      rm(mPUP)
      gc()
      
      CN.table <- dplyr::arrange(CN.table, chr, start, end)
      # CN.table <- dplyr::select(CN.table, chr, start, end, bin, tot_count.test, tot_count.ref, GCPC)
      CN.table <- dplyr::select(CN.table, chr, start, end, bin, tot_count.test, tot_count.ref)
      colnames(SNP.table) <- c("chr", "pos", "bin", "tot_count.test", "alt_count.test", "tot_count.ref", "alt_count.ref")
      
      gc()
      return(list(CN = CN.table, SNP = SNP.table))
    }
    stopCluster(cl)
    return(BAMcounts)
  }
  
  COUNTS.all <- pileup.go(testBAM = testBAM, refBAM = refBAM, bed.data = bed.data, scanBamFlag = param.FLAG, pileupParam = param.PILEUP, nsubthread = nsubthread, cluster.type = cluster.type)
  
  CN.all <- foreach(k = seq_along(COUNTS.all), .combine = "rbind") %do% {
    k.tmp <- COUNTS.all[[k]]$CN
    COUNTS.all[[k]]$CN <- NULL
    gc()
    return(k.tmp)
  }
  
  SNP.all <- foreach(k = seq_along(COUNTS.all), .combine = "rbind") %do% {
    k.tmp <- COUNTS.all[[k]]$SNP
    COUNTS.all[[k]]$SNP <- NULL
    gc()
    return(k.tmp)
  }
  
  rm(COUNTS.all)
  gc()
  
  ## Building WESobj
  
  CN.all$bin <- as.integer(CN.all$bin)
  SNP.all$bin <- as.integer(SNP.all$bin)
  
  ### summaries (recoded function as R internal summary uses too much RAM !)
  my.summary <- function(myv = NULL) {
    vsum <- c(min(myv, na.rm = TRUE), quantile(myv, .25, na.rm = TRUE), median(myv, na.rm = TRUE), mean(myv, na.rm = TRUE), quantile(myv, .75, na.rm = TRUE), max(myv, na.rm = TRUE))
    names(vsum) <- c("min", "q25", "median", "mean", "q75", "max")
    return(vsum)
  }
  meta.w$BIN.tot.count.test.mean.summary <- my.summary(CN.all$tot_count.test[!is.na(CN.all$tot_count.test)])
  meta.w$BIN.tot.count.ref.mean.summary <- my.summary(CN.all$tot_count.ref[!is.na(CN.all$tot_count.ref)])
  meta.w$SNP.tot.count.test.summary <- my.summary(SNP.all$tot_count.test[!is.na(SNP.all$tot_count.test)])
  meta.w$SNP.tot.count.ref.summary <- my.summary(SNP.all$tot_count.ref[!is.na(SNP.all$tot_count.ref)])
  gc()
  
  ## Cleaning uncovered chr levels
  CN.all$chr <- droplevels(CN.all$chr)
  SNP.all$chr <- droplevels(SNP.all$chr)
  
  WESobj <- list(RD = CN.all, SNP = SNP.all, meta = list(basic = meta.b, WES = meta.w))
  rm(CN.all, SNP.all)
  gc()
  

  ## QC : Computing coverages
  tmsg("Computing coverages ...")
  gw.rd <- sum(WESobj$RD$end - WESobj$RD$start +1)
  gw.snp <- nrow(WESobj$SNP)
  rd.cov <- data.frame(cuts = c(1, 5, 10, 20, 30, 40, 50, 75, 100, 150, 200), stringsAsFactors = FALSE)
  rd.cov <- cbind(rd.cov, t(foreach::foreach(x = rd.cov$cuts, .combine = "cbind") %do% {
    test.rd.in <- WESobj$RD$tot_count.test >= x
    ref.rd.in <- WESobj$RD$tot_count.ref >= x
    test.snprd.in <- WESobj$SNP$tot_count.test >= x
    ref.snprd.in <- WESobj$SNP$tot_count.ref >= x
    test.cut.cov <- if(!any(test.rd.in)) NA else (sum(WESobj$RD$end[test.rd.in] - WESobj$RD$start[test.rd.in] +1)/gw.rd)
    ref.cut.cov <- if(!any(ref.rd.in)) NA else (sum(WESobj$RD$end[ref.rd.in] - WESobj$RD$start[ref.rd.in] +1)/gw.rd)
    test.snpcut.cov <- if(!any(test.snprd.in)) NA else (length(which(test.snprd.in))/gw.snp)
    ref.snpcut.cov <- if(!any(ref.snprd.in)) NA else (length(which(ref.snprd.in))/gw.snp)
    return(c(test.cut.cov, ref.cut.cov, test.snpcut.cov, ref.snpcut.cov))
  }))
  colnames(rd.cov) <- c("MinDepth", "TestBINCoverage", "RefBINCoverage", "TestBAFCoverage", "RefBAFCoverage")
  rm(test.snprd.in, ref.snprd.in, test.rd.in, ref.rd.in)
  
  if (write.data || plot) dir.create(paste0(out.dir, "/", samplename))
  if (write.data) write.table(rd.cov, file = paste0(out.dir, "/", samplename,  "/", samplename, '_WES_', genome, "_b", meta.w$bin.size, "_coverage.txt"), sep = "\t", quote = FALSE, row.names = FALSE)
  
  ## QC : Plotting coverages
  if (plot) {
    ### Coverage plot
    png(paste0(out.dir, "/", samplename, "/", samplename, "_WES_", genome, "_b", meta.w$bin.size, "_coverage.png"), 800, 640)
    plot(rd.cov$MinDepth, rd.cov$TestBAFCoverage, type = "b", col = 2, lty = 3, pch = 20, main = paste0(WESobj$meta$basic$samplename, "\nCoverage Plot"), xlab = "Minimum depth", ylab = "Coverage", ylim = c(0,1), xaxp = c(0,200,10))
    abline(v = rd.cov$MinDepth, lty = 2, col = "grey75")
    abline(h = seq(0,1,.1), lty = 2, col = "grey75")
    lines(rd.cov$MinDepth, rd.cov$RefBAFCoverage, type = "b", col = 1, lty = 3, pch = 20)
    lines(rd.cov$MinDepth, rd.cov$TestBINCoverage, type = "b", col = 2)
    lines(rd.cov$MinDepth, rd.cov$RefBINCoverage, type = "b", col = 1)
    abline(h = .5, lty = 2)
    legend("topright", legend = c("Test BAF", "Ref BAF", "Test CN", "Ref CN"), inset = .02, col = c(2,1,2,1), lty = c(3,3,1,1), pch = c(20,20,1,1))
    dev.off()
    
    ### RD plots
    png(paste0(out.dir, "/", samplename, "/", samplename, "_WES_", genome, "_b", meta.w$bin.size, "_rawdepth.png"), 1600, 1050)
    par(mfrow = c(2,1))
    test.l10 <- log10(WESobj$RD$tot_count.test +1)
    plot(test.l10,  pch = ".", xaxs = "i", xlab = "Index", ylab = "log10(RD+1)", main = paste0(samplename, " TEST (", round(10^median(test.l10, na.rm = TRUE)), ")"))
    abline(h = median(test.l10, na.rm = TRUE), lty = 2, col = "cyan")
    lines(runmed(test.l10, 9999), col = 2)
    ref.l10 <- log10(WESobj$RD$tot_count.ref +1)
    plot(ref.l10,  pch = ".", xaxs = "i", xlab = "Index", ylab = "log10(RD+1)", main = paste0(samplename, " REF (", round(10^median(ref.l10, na.rm = TRUE)), ")"))
    abline(h = median(ref.l10, na.rm = TRUE), lty = 2, col = "cyan")
    lines(runmed(ref.l10, 9999), col = 2)
    dev.off()
  }
  rm(rd.cov)
  
  ## Saving
  if (write.data) {
    tmsg("Saving counts data ...")
    saveRDS(WESobj, file = paste0(out.dir, "/", samplename, "/", samplename, "_", genome, "_b", meta.w$bin.size, "_binned.RDS"), compress = "bzip2")
  }
  if (return.data) return(WESobj)
}

## Performs the binning of BAMs using a BINpack, batch mode
WES.Bin.Batch <- function(BAM.list.file = NULL, BINpack = NULL, nthread = 1, cluster.type = "PSOCK", ...) {

  if (!file.exists(BAM.list.file)) stop("Could not find BAM.list.file !", call. = FALSE)
  message("Reading and checking BAM.list.file ...")
  myBAMs <- read.table(file = BAM.list.file, header = TRUE, sep="\t", check.names = FALSE, as.is = TRUE)
  head.ok <- c("testBAM", "refBAM", "SampleName")
  head.chk <- all(colnames(BAM.list.file) == head.ok)
  if (!head.chk) {
    message("Invalid header in BAM.list.file !")
    message(paste0("EXPECTED : ", head.ok))
    message(paste0("FOUND : ", colnames(myBAMs)))
    stop("Invalid header.", call. = FALSE)
  }

  tb.chk <- file.exists(myBAMs$testBAM)
  rb.chk <- file.exists(myBAMs$refBAM)

  if (!all(tb.chk) || !all(tb.chk)) {
    message("Some BAM files from the BAM.list.file could not be found (wrong path or filename ?) !")
    message("Missing testBAM file(s) :")
    message(myBAMs$testBAM[which(!tb.chk)])
    message("Missing refBAM file(s) :")
    message(myBAMs$refBAM[which(!rb.chk)])
    stop("Missing BAM file(s).", call. = FALSE)
  }
  sn.chk <- duplicated(myBAMs$SampleName)
  if (any(sn.chk)) {
    message("BAM.list.file contains duplicated samplenames !")
    message(myBAMs$SampleName[which(duplicated(myBAMs$SampleName))])
    stop("Duplicated samplenames.", call. = FALSE)
  }
  if(any(myBAMs$testBAM == myBAMs$refBAM)) {
    message("Some testBAM and refBAM are identical for at least one sample !")
    stop("Identical BAM files for Test and Ref.", call. = FALSE)
  }

  ## Adjusting cores/threads
  message("Adjusting number of cores if needed ...")
  if (is.null(nthread)) nthread <- parallel::detectCores(logical = TRUE) -1
  if (nrow(myBAMs) < nthread) nthread <- nrow(myBAMs)

  message("Running EaCoN.WES.Bin() in batch mode ...")
  message(paste0("Found ", nrow(myBAMs), " samples to process ..."))
  current.bitmapType <- getOption("bitmapType")
  `%dopar%` <- foreach::"%dopar%"
  cl <- parallel::makeCluster(spec = nthread, type = cluster.type, outfile = "")
  doParallel::registerDoParallel(cl)
  eacon.batchres <- foreach::foreach(r = seq_len(nrow(myBAMs)), .inorder = TRUE, .errorhandling = "stop", .export = c("EaCoN.set.bitmapType", "WES.Bin", "tmsg")) %dopar% {
    EaCoN.set.bitmapType(type = current.bitmapType)
    WES.Bin(testBAM = myBAMs$testBAM[r], refBAM = myBAMs$refBAM[r], BINpack = BINpack, samplename = myBAMs$SampleName[r], cluster.type = cluster.type, ...)
  }
  parallel::stopCluster(cl)
}

## Performs the normalization of WES L2R and BAF signals
WES.Normalize <- function(data = NULL, BINpack = NULL, gc.renorm = TRUE, wave.renorm = FALSE, wave.rda = NULL, RD.tot.min = 20, RD.alt.min = 3, BAF.hetmin = .33, sex.chr = c("chrX", "chrY"), TumorBoost = FALSE, out.dir = getwd(), return.data = FALSE, write.data = TRUE, plot = TRUE) {

  # setwd("/mnt/data_cigogne/job/PUBLI_EaCoN/TCGA/ANALYSES/EaCoN_0.3.0_beta2/WES/TCGA-A7-A0CE-01A_vs_10A")
  # data <- readRDS("Sample_PHEO_AG_HS_048_DNA_hg38_b50_binned.RDS")
  # BINpack <- "V4-UTRs.hg38.fragment_targets_minimal_sorted_longChr_hg38_b50.GC.rda"
  # gc.renorm <- TRUE
  # wave.renorm <- FALSE
  # # wave.rda <- "/mnt/data_cigogne/job/PUBLI_EaCoN/TCGA/RESOURCES/SureSelect_ClinicalResearchExome.padded_GRCh37-lite_merged_sorted_hs37d5_b50.Wave.rda"
  # RD.tot.min = 20
  # RD.alt.min = 3
  # TumorBoost = FALSE
  # # sex.chr <- c("X", "Y")
  # sex.chr <- c("chrX", "chrY")
  # out.dir = getwd()
  # return.data = FALSE
  # write.data = TRUE
  # plot = TRUE
  # BAF.hetmin <- .33
  # source("/home/job/git_gustaveroussy/EaCoN/R/mini_functions.R")
  # source("/home/job/git_gustaveroussy/EaCoN/R/renorm_functions.R")
  # require(foreach)

  ### AJOUTER UN CONTROLE DU RDS (pour que ce ne soit pas un _processed.RDS donné en entrée!)
  
  
  ## CHECKS
  if (!is.list(data)) stop(tmsg("data should be a list !"), call. = FALSE)
  if (is.null(BINpack)) stop(tmsg("A BINpack file is required !"), call. = FALSE)
  if (!file.exists(BINpack)) stop(tmsg("Could not find the BINpack file !"), call. = FALSE)
  if (wave.renorm) { if (!is.null(wave.rda)) { if (!file.exists(wave.rda)) stop(tmsg(paste0("Could not find wave.rda file ", wave.rda)), call. = FALSE) } }
  if (RD.tot.min < 0) stop(tmsg("RD.tot.min must be >= 0 !"), call. = FALSE)
  if (RD.alt.min <= 0) stop(tmsg("RD.alt.min must be > 0 !"), call. = FALSE)

  ## TAGS
  data$meta$WES$TumorBoost <- as.character(TumorBoost)
  data$meta$WES$RD.tot.min <- RD.tot.min
  data$meta$WES$GC.renorm <- as.character(gc.renorm)
  data$meta$WES$Wave.renorm <- as.character(wave.renorm)
  samplename <- data$meta$basic$samplename

  ## Loading BINpack
  load(BINpack)
  gc()

  genome.pkg <- renorm.data$info$value[renorm.data$info$key == "genome-package"]
  if (!genome.pkg %in% BSgenome::installed.genomes()) {
    if (genome.pkg %in% BSgenome::available.genomes()) {
      stop(tmsg(paste0("BSgenome ", genome.pkg, " available but not installed. Please install it !")), call. = FALSE)
    } else {
      stop(tmsg(paste0("BSgenome ", genome.pkg, " not available in valid BSgenomes and not installed ... Please check your genome name or install your custom BSgenome !")), call. = FALSE)
    }
  }

  ### Loading genome
  tmsg(paste0("Loading ", genome.pkg, " ..."))
  suppressPackageStartupMessages(require(genome.pkg, character.only = TRUE))
  # requireNamespace(genome.pkg, quietly = TRUE)
  BSg.obj <- getExportedValue(genome.pkg, genome.pkg)
  # genome <- BSgenome::providerVersion(BSg.obj)
  genome <- metadata(BSg.obj)$genome
  cs <- chromobjector(BSg.obj)

  ## BAF HANDLING
  tmsg("Processing BAF ...")
  
  BAF.adder <- function(Bdata = NULL, Bvalues = NULL, newname = NULL, type = "test") {
    Bdata[[paste0("BAF.", type)]] <- Bdata[[newname]] <- Bvalues
    Bdata[[paste0("mBAF.", type)]] <- BAF2mBAF(Bdata[[paste0("BAF.", type)]])
    return(Bdata)
  }

  ### BAF : Filtering (low depth)
  rd.ori <- nrow(data$SNP)
  #### 1) Filtering for low ref or test depth
  RDlow <- data$SNP$tot_count.test < RD.tot.min | data$SNP$tot_count.ref < RD.tot.min
  if (length(which(RDlow)) == nrow(data$SNP)) stop(tmsg("All SNP positions were discarded for their low read count ! You may consider lowering the BAF.tot.min value."), call. = FALSE)
  tmsg(paste0("Removed ", length(which(RDlow)), " (", round(length(which(RDlow)) / rd.ori * 100, digits = 2), "%) SNP positions with low depth (<", RD.tot.min, ")"))
  data$SNP <- data$SNP[!RDlow,]
  #### 2) Filtering for low alt test depth
  BRDlow <- data$SNP$alt_count.test < RD.alt.min
  if (length(which(BRDlow)) == nrow(data$SNP)) stop(tmsg("All SNP positions were discarded for their low alternative allele count ! You may consider lowering the RD.alt.min value."), call. = FALSE)
  tmsg(paste0("Removed ", length(which(BRDlow)), " (", round(length(which(BRDlow)) / rd.ori * 100, digits = 2), "%) SNP positions with low alt RD (<", RD.alt.min, ")"))
  data$SNP <- data$SNP[!BRDlow,]
  gc()

  ### Computing BAF
  data$SNP$BAF.test.ori <- data$SNP$alt_count.test / data$SNP$tot_count.test
  data$SNP$BAF.ref.ori <- data$SNP$alt_count.ref / data$SNP$tot_count.ref
  odd.idx <- which(data$SNP$pos %% 2 == 1)
  data$SNP$BAF.test.ori[odd.idx] <- -data$SNP$BAF.test.ori[odd.idx] +1L
  data$SNP$BAF.ref.ori[odd.idx] <- -data$SNP$BAF.ref.ori[odd.idx] +1L

  data$SNP <- BAF.adder(Bdata = data$SNP, Bvalues = data$SNP$BAF.test.ori, newname = "BAF.test.ori")
  data$SNP <- BAF.adder(Bdata = data$SNP, Bvalues = data$SNP$BAF.ref.ori, newname = "BAF.ref.ori", type = "ref")
  
  ###### Computing LOR (and variance)
  rcmat <- round(cbind(data$SNP$BAF.test.ori*data$SNP$tot_count.test, (1-data$SNP$BAF.test.ori)*data$SNP$tot_count.test))
  data$SNP$LOR <- log(rcmat[,1]+1/6) - log(rcmat[,2]+1/6)
  data$SNP$LORvar <- 1/(rcmat[,1]+1/6) + 1/(rcmat[,2]+1/6)
  rm(rcmat)
  gc()
  
  ### BAF : TumorBoost
  if (TumorBoost) {
    message(tmsg("Applying TumorBoost BAF normalization ..."))
    # data$SNP$BAF.test <- data$SNP$BAF.test.TB <- as.numeric(aroma.light::normalizeTumorBoost(data$SNP$BAF.test, data$SNP$BAF.ref, flavor = "v4", preserveScale = FALSE))
    BTB <- as.numeric(aroma.light::normalizeTumorBoost(data$SNP$BAF.test, data$SNP$BAF.ref, flavor = "v4", preserveScale = FALSE))
    data$SNP <- BAF.adder(Bdata = data$SNP, Bvalues = BTB, newname = "BAF.test.TB")
  }

  ### Getting heterozygous probes from Ref
  Ref.hetero <- data$SNP$mBAF.ref >= BAF.hetmin
  if (!any(Ref.hetero)) stop(tmsg("All SNP positions were tagged as homozygous in Ref : there may be a problem with your reference BAM ploidy !"), call. = FALSE)

  ### Keeping hetero positions
  data$SNP <- data$SNP[Ref.hetero,]
  ### Removing additional values per bin
  data$SNP <- data$SNP[!duplicated(data$SNP$bin),]
  gc()
  
  ### BAF filtering
  # smoB <- round(nrow(data$SNP) / 3300)
  # if(smoB%%2 == 0) smoB <- smoB+1
  # mBAF.rm <- runmed(data$SNP$mBAF.test, smoB)
  # mBAF.diff <- abs(data$SNP$mBAF.test - mBAF.rm)
  # Bfiltered <- mBAF.diff < quantile(mBAF.diff, BAF.filter)
  # data$SNP <- data$SNP[Bfiltered,]

  ### Adding LOR
  data$SNP$LOR.test <- log(data$SNP$alt_count.test / (data$SNP$tot_count.test - data$SNP$alt_count.test))

  ## L2R
  tmsg("Processing RD bins ...")

  #### Computing L2R
  data$RD$L2R <- data$RD$L2R.ori <- log2((data$RD$tot_count.test+1) / (data$RD$tot_count.ref+1))

  ### L2R : Filtering
  #### 1) low depth
  rd.ori <- nrow(data$RD)
  RDlow <- data$RD$tot_count.test < RD.tot.min | data$RD$tot_count.ref < RD.tot.min
  data$meta$WES$Imputed.lowdepth.bins <- length(which(RDlow))
  if (length(which(RDlow)) == nrow(data$RD)) stop(tmsg("All RD bins were flagged for their low read count ! You may consider lowering the BAF.tot.min value."), call. = FALSE)
  tmsg(paste0("Flagged ", length(which(RDlow)), " (", round(length(which(RDlow)) / rd.ori * 100, digits = 2), "%) RD bins with low depth (<", RD.tot.min, ")"))

  #### 2) GC% outliers
  GCOL <- renorm.data$tracks[,5] < 200 | renorm.data$tracks[,5] > 800
  data$meta$WES$Imputed.GCoutlier.bins <- length(which(GCOL))
  if (length(which(GCOL)) == nrow(data$RD)) stop(tmsg("All RD bins were flagged as GC% outliers  ! There may be something wrong with your reference genome and/or capture BED."), call. = FALSE)
  tmsg(paste0("Flagged ", length(which(GCOL)), " (", round(length(which(GCOL)) / rd.ori * 100, digits = 2), "%) RD bins as GC% outliers."))

  ### Pooling and imputing
  FLAGS <- RDlow + GCOL > 0

  if (any(FLAGS)) {
    tmsg(paste0(" Imputed ", length(which(FLAGS)), " (", round(length(which(FLAGS))/rd.ori*100, digits = 2), "%) L2R bins."))
    l2r.tmp <- data$RD$L2R
    l2r.tmp[FLAGS] <- NA
    data$RD$L2R <- data$RD$L2R.imp <- approxfun(seq_along(l2r.tmp), l2r.tmp, rule = 2)(seq_along(l2r.tmp))
  } else data$RD$L2R.imp <- data$RD$L2R
  gc()

  ## L2R : Normalization
  smo <- round(nrow(data$RD) / 550)
  if(smo%%2 == 0) smo <- smo+1

  ### Wave
  if (wave.renorm) {
    tmsg("Wave normalization ...")

    l2r2norm <- data.frame(ProbeSetName = data$RD$bin, chr = as.character(data$RD$chr), pos = data$RD$start, L2R = data$RD$L2R)
    # rownames(l2r2norm) <- seq_len(nrow(l2r2norm))
    ren.res <- renorm.go(input.data = l2r2norm, renorm.rda = wave.rda, track.type = "Wave", smo = smo, arraytype = data$meta$basic$type, genome = genome)

    fitted.l2r <- ren.res$renorm$l2r$l2r

    GCF <- is.na(fitted.l2r)
    if (any(GCF)) {
      l2r.tmp <- fitted.l2r
      l2r.tmp[GCF] <- NA
      fitted.l2r <- approxfun(seq_along(l2r.tmp), l2r.tmp, rule = 2)(seq_along(l2r.tmp))
    }

    if(is.null(ren.res$renorm$pos)) {
      # meta.b <- setmeta("gc.renorm", "None", meta.b)
      data$meta$WES <- setmeta("wave.renorm", "None", data$meta$WES)
      tmsg(" No positive fit.")
    } else {
      ## Tweaking sex chromosomes
      sex.idx <- data$RD$chr %in% sex.chr
      auto.ori.med <- median(data$RD$L2R[!sex.idx], na.rm = TRUE)
      auto.rn.med <- median(fitted.l2r[!sex.idx], na.rm = TRUE)
      if (any(sex.idx)) {
        for (k in sex.chr) {
          k.idx <- data$RD$chr == k
          if (any(k.idx)) {
            k.ori.diffmed <- median(data$RD$L2R.ori[k.idx], na.rm = TRUE) - auto.ori.med
            k.rn.diffmed <- median(fitted.l2r[k.idx], na.rm = TRUE) - auto.rn.med
            fitted.l2r[k.idx] <- fitted.l2r[k.idx] - k.rn.diffmed + k.ori.diffmed
          }
        }
      }
      # meta.b <- setmeta("wave.renorm", paste0(ren.res$renorm$pos, collapse = ","), meta.b)
      data$meta$WES <- setmeta("wave.renorm", paste0(ren.res$renorm$pos, collapse = ","), data$meta$WES)
    }
    rm(ren.res)

    data$RD$L2R.WAVE <- data$RD$L2R <- fitted.l2r - median(fitted.l2r, na.rm = TRUE)
  } else {
    # meta.b <- setmeta("wave.renorm", "FALSE", meta.b)
    data$meta$WES <- setmeta("wave.renorm", "FALSE", data$meta$WES)
  }

  #### GC%
  # message("GC% normalization ...")
  # data$RD$L2R.GC <- data$RD$L2R <- limma::loessFit(x = data$RD$GCPC, y = data$RD$L2R)$residuals
  # if (any(is.na(data$RD$L2R))) {
  #   l2r.tmp <- data$RD$L2R
  #   l2r.tmp[is.na(data$RD$L2R)] <- NA
  #   data$RD$L2R <- data$RD$L2R.GC <- approxfun(seq_along(l2r.tmp), l2r.tmp, rule = 2)(seq_along(l2r.tmp))
  # }



  if (gc.renorm) {
    tmsg("GC% normalization ...")

    l2r2norm <- data.frame(ProbeSetName = data$RD$bin, chr = as.character(data$RD$chr), pos = data$RD$start, L2R = data$RD$L2R)
    # rownames(l2r2norm) <- seq_len(nrow(l2r2norm))
    ren.res <- renorm.go(input.data = l2r2norm, renorm.rda = BINpack, track.type = "GC", smo = smo, arraytype = data$meta$basic$type, genome = genome)

    fitted.l2r <- ren.res$renorm$l2r$l2r

    GCF <- is.na(fitted.l2r)
    if (any(GCF)) {
      l2r.tmp <- fitted.l2r
      l2r.tmp[GCF] <- NA
      fitted.l2r <- approxfun(seq_along(l2r.tmp), l2r.tmp, rule = 2)(seq_along(l2r.tmp))
    }

    if(is.null(ren.res$renorm$pos)) {
      # meta.b <- setmeta("gc.renorm", "None", meta.b)
      data$meta$eacon <- setmeta("gc.renorm", "None", data$meta$eacon)
      tmsg(" No positive fit.")
    } else {
      ## Tweaking sex chromosomes
      sex.idx <- data$RD$chr %in% sex.chr
      auto.ori.med <- median(data$RD$L2R.ori[!sex.idx], na.rm = TRUE)
      auto.rn.med <- median(fitted.l2r[!sex.idx], na.rm = TRUE)
      if (any(sex.idx)) {
        for (k in sex.chr) {
          k.idx <- data$RD$chr == k
          if (any(k.idx)) {
            # k.ori.diffmed <- median(data$RD$L2R.ori[k.idx], na.rm = TRUE) - auto.ori.med
            k.ori.diffmed <- median(data$RD$L2R.ori[k.idx], na.rm = TRUE) - auto.ori.med
            k.rn.diffmed <- median(fitted.l2r[k.idx], na.rm = TRUE) - auto.rn.med
            fitted.l2r[k.idx] <- fitted.l2r[k.idx] - k.rn.diffmed + k.ori.diffmed
          }
        }
      }
      # meta.b <- setmeta("gc.renorm", paste0(ren.res$renorm$pos, collapse = ","), meta.b)
      data$meta$eacon <- setmeta("gc.renorm", paste0(ren.res$renorm$pos, collapse = ","), data$meta$eacon)
    }
    rm(ren.res)

    data$RD$L2R.GC <- data$RD$L2R <- fitted.l2r - median(fitted.l2r, na.rm = TRUE)
  } else {
    # meta.b <- setmeta("gc.renorm", "FALSE", meta.b)
    data$meta$eacon <- setmeta("gc.renorm", "FALSE", data$meta$eacon)
  }

  ## Merging
  data$RD$BAF <- dplyr::left_join(data$RD[, c(1:4,9)], data$SNP[, c(1,3,which(colnames(data$SNP) == "BAF.test"))], by = c("chr", "bin"))$BAF.test
  # data$RD$LOR <- dplyr::left_join(data$RD[, c(1:4,9)], data$SNP[, c(1,3,which(colnames(data$SNP) == "LOR.test"))], by = c("chr", "bin"))$LOR.test
  data$RD$LOR <- dplyr::left_join(data$RD[, c(1:4,9)], data$SNP[, c(1,3,which(colnames(data$SNP) == "LOR"))], by = c("chr", "bin"))$LOR
  data$RD$LORvar <- dplyr::left_join(data$RD[, c(1:4,9)], data$SNP[, c(1,3,which(colnames(data$SNP) == "LORvar"))], by = c("chr", "bin"))$LORvar
  data$RD$RD.test <- dplyr::left_join(data$RD[, c(1:4,9)], data$SNP[, c(1,3,which(colnames(data$SNP) == "tot_count.test"))], by = c("chr", "bin"))$tot_count.test
  data$RD$RD.ref <- dplyr::left_join(data$RD[, c(1:4,9)], data$SNP[, c(1,3,which(colnames(data$SNP) == "tot_count.ref"))], by = c("chr", "bin"))$tot_count.ref
  
  ## Building ASCAT object
  tmsg("Building normalized object ...")

  my.ch <- sapply(unique(data$RD$chr), function(x) { which(data$RD$chr == x) })
  
  my.ascat.obj <- list(
    data = list(
      Tumor_LogR.ori = data.frame(sample = data$RD$L2R.ori, row.names = data$RD$bin),
      Tumor_LogR = data.frame(sample = data$RD$L2R, row.names = data$RD$bin),
      Tumor_BAF = data.frame(sample = data$RD$BAF, row.names = data$RD$bin),
      Tumor_LogR_segmented = NULL,
      Tumor_BAF_segmented = NULL,
      Germline_LogR = NULL,
      Germline_BAF = NULL,
      SNPpos = data.frame(chrs = data$RD$chr, pos = round((data$RD$start + data$RD$end)/2)),
      ch = my.ch,
      chr = my.ch,
      chrs = levels(data$RD$chr),
      samples = samplename,
      gender = "NA",
      sexchromosomes = sex.chr,
      failedarrays = NULL,
      additional = data$RD[,colnames(data$RD) %in% c("RD.test", "RD.ref", "LOR", "LORvar")]
    ),
    meta = data$meta,
    germline = list(germlinegenotypes = matrix(is.na(data$RD$BAF), ncol = 1, dimnames = list(data$RD$bin, samplename)), failedarrays = NULL)
  )
  colnames(my.ascat.obj$data$Tumor_LogR) <- colnames(my.ascat.obj$data$Tumor_LogR.ori) <- colnames(my.ascat.obj$data$Tumor_BAF) <- samplename
  # rm(my.ch, data)
  gc()

  # plot(ares$Tumor_LogR[,1], pch = ".", cex = 3, xaxs = "i", ylim = c(-2,2))
  # points(ares$Tumor_LogR_segmented, pch = ".", cex = 3, col = 2)
  # plot(ares$Tumor_BAF[!is.na(ares$Tumor_BAF),1], pch = ".", xaxs = "i", cex = 3)
  # points(ares$Tumor_BAF_segmented[[1]], pch = ".", cex = 3, col = 2)
  # points(1 - ares$Tumor_BAF_segmented[[1]], pch = ".", cex = 3, col = 2)
  #
  ## Saving data
  if (write.data) {
    tmsg("Saving normalized data ...")
    saveRDS(my.ascat.obj, paste0(out.dir, "/", samplename, "_", data$meta$basic$genome, "_b", data$meta$WES$bin.size, "_processed.RDS"), compress = "bzip2")
  }

  ## Plot
  tmsg("Plotting ...")
  if (plot) {
    l2r <- my.ascat.obj$data$Tumor_LogR[,1]
    l2r.rm <- runmed(l2r, smo)
    l2r.dif <- diff(l2r)
    l2r.mad <- median(abs(l2r.dif[l2r.dif != 0]))
    l2r.rm.dif <- diff(l2r.rm)
    l2r.ssad <- sum(abs(l2r.rm.dif[l2r.rm.dif != 0]))
    
    l2r.ori <- my.ascat.obj$data$Tumor_LogR.ori[,1]
    l2r.ori.rm <- runmed(l2r.ori, smo)
    l2r.ori.dif <- diff(l2r.ori)
    l2r.ori.mad <- median(abs(l2r.ori.dif[l2r.ori.dif != 0]))
    l2r.ori.rm.dif <- diff(l2r.ori.rm)
    l2r.ori.ssad <- sum(abs(l2r.ori.rm.dif[l2r.ori.rm.dif != 0]))
    
    l2r.genopos <- my.ascat.obj$data$SNPpos$pos + cs$chromosomes$chr.length.toadd[my.ascat.obj$data$SNPpos$chrs]
    
    l2r <- l2r - median(l2r, na.rm = TRUE)
    l2r.ori <- l2r.ori - median(l2r.ori, na.rm = TRUE)
    kend <- l2r.genopos[vapply(unique(my.ascat.obj$data$SNPpos$chr), function(k) { max(which(my.ascat.obj$data$SNPpos$chrs == k))}, 1)]
    
    png(paste0(out.dir, "/", samplename, "_WES_", data$meta$basic$genome, "_rawplot.png"), 1600, 1050)
    par(mfrow = c(3,1))
    plot(l2r.genopos, l2r.ori, pch = ".", cex = 3, col = "grey70", xaxs = "i", yaxs = "i", ylim = c(-2,2), main = paste0(samplename, " WES (", data$meta$basic$manufacturer, ") raw L2R profile (median-centered)\nMAD = ", round(l2r.ori.mad, digits = 2), " ; SSAD = ", round(l2r.ori.ssad, digits = 2)), xlab = "Genomic position", ylab = "L2R")
    lines(l2r.genopos, l2r.ori.rm, col = 1)
    abline(v = kend, col = 4, lty = 3, lwd = 2)
    abline(h = 0, col = 2, lty = 2, lwd = 2)
    plot(l2r.genopos, l2r, pch = ".", cex = 3, col = "grey70", xaxs = "i", yaxs = "i", ylim = c(-2,2), main = paste0(samplename, " WES (", data$meta$basic$manufacturer, ") normalized L2R profile (median-centered)\nMAD = ", round(l2r.mad, digits = 2), " ; SSAD = ", round(l2r.ssad, digits = 2)), xlab = "Genomic position", ylab = "L2R")
    lines(l2r.genopos, l2r.rm, col = 1)
    abline(v = kend, col = 4, lty = 3, lwd = 2)
    abline(h = 0, col = 2, lty = 2, lwd = 2)
    plot(l2r.genopos, my.ascat.obj$data$Tumor_BAF[,1], pch = ".", cex = 3, col = "grey75", xaxs = "i", yaxs = "i", ylim = c(0,1), main = paste0(samplename, " WES (", data$meta$basic$manufacturer, ")", if(TumorBoost) " TumorBoost-normalized", " BAF profile"), xlab = "Genomic position", ylab = "BAF")
    abline(v = kend, col = 4, lty = 3, lwd = 2)
    abline(h = .5, col = 2, lty = 2, lwd = 2)
    dev.off()
  }

  tmsg("Done.")
  if(return.data) return(my.ascat.obj)
}

## Runs WES.Normalize using a RDS filename
WES.Normalize.ff <- function(BIN.RDS.file = NULL, ...) {
  
  ## CHECKS
  if (is.null(BIN.RDS.file)) stop(tmsg("An RDS file from EaCoN::EaCoN.WES.Bin is required !"), call. = FALSE)
  if (!file.exists(BIN.RDS.file)) stop(tmsg(paste0("Could not find ", BIN.RDS.file, " .")), call. = FALSE)

  tmsg("Loading binned WES data ...")
  my.data <- readRDS(BIN.RDS.file)
  WES.Normalize(data = my.data, out.dir = dirname(BIN.RDS.file), ...)
}

## Runs WES.Normalize.ff, batch mode
WES.Normalize.ff.Batch <- function(BIN.RDS.files = list.files(path = getwd(), pattern = "_binned.RDS$", all.files = FALSE, full.names = TRUE, recursive = TRUE, ignore.case = FALSE, include.dirs = FALSE), nthread = 1, cluster.type = "PSOCK", ...) {
  if (length(BIN.RDS.files) == 0) stop("No file found to process !", call. = FALSE)
  message("Running EaCoN.WES.Normalize.ff() in batch mode ...")
  message(paste0("Found ", length(BIN.RDS.files), " samples to process ..."))
  current.bitmapType <- getOption("bitmapType")
  `%dopar%` <- foreach::"%dopar%"
  cl <- parallel::makeCluster(spec = nthread, type = cluster.type, outfile = "")
  doParallel::registerDoParallel(cl)
  eacon.batchres <- foreach::foreach(r = seq_along(BIN.RDS.files), .inorder = TRUE, .errorhandling = "stop") %dopar% {
    EaCoN.set.bitmapType(type = current.bitmapType)
    WES.Normalize.ff(BIN.RDS.file = BIN.RDS.files[r], ...)
  }
  parallel::stopCluster(cl)
}

bedBinner <- function(bed = NULL, bin.size = 50, nthread = 1) {
  
  bin.size <- as.integer(bin.size)
  cl <- parallel::makeCluster(spec = nthread, type = "PSOCK", outfile = "")
  suppressPackageStartupMessages(require(foreach))
  doParallel::registerDoParallel(cl)
  k <- 0
  bed.binned <- foreach::foreach(k = unique(bed$chr), .combine = "rbind", .export = "tmsg") %dopar% {
    tmsg(k)
    
    bedk <- bed[bed$chr == k,]
    
    b <- 0
    bbk <- foreach::foreach(b = seq_len(nrow(bedk)), .combine = "rbind", .export = "bin.size") %do% {
      
      ### Smaller exon
      exon.length <- (bedk$end[b] - bedk$start[b] + 1L)
      if (exon.length <= bin.size) return(bedk[b,])
      mod.rest <- exon.length %% bin.size
      mod.count <- as.integer((exon.length - mod.rest) / bin.size)
      
      bin.starts <- bedk$start[b] + ((seq_len(mod.count)-1L) * bin.size)
      bin.ends <- bin.starts + bin.size - 1L
      
      ## Non-Round count
      if (mod.rest > 0L) {
        ## Enough for a new bin
        if (mod.rest >= (bin.size / 2L)) {
          bin.starts <- c(bin.starts, bin.starts[mod.count]+bin.size)
          bin.ends <- c(bin.ends, bedk$end[b])
          mod.count <- mod.count+1L
        } else { ## Dispatch to inside bins
          if (mod.rest >= mod.count) {
            mod.rest2 <- mod.rest %% mod.count
            mod.count2 <- as.integer((mod.rest - mod.rest2) / mod.count)
            
            bin.starts <- bedk$start[b] + ((seq_len(mod.count)-1L) * (bin.size + mod.count2))
            bin.ends <- bin.starts + (bin.size + mod.count2) - 1L
            bin.ends[mod.count] <- bin.ends[mod.count] + mod.rest2
          } else {
            # bin.ends[mod.count] <- bin.ends[mod.count] + mod.rest
            bin.ends[mod.count] <- bedk$end[b]
          }
        }
      }
      # if(length(bin.starts) != length(bin.starts))
      chrs = rep(bedk$chr[b], mod.count)
      return(data.frame(chr = chrs, start = bin.starts, end = bin.ends, stringsAsFactors = FALSE))
    }
    
    return(bbk)
  }
  parallel::stopCluster(cl)
  return(bed.binned)
}

## Compute letter composition of nucleotidic sequences from a (chr, start, end) dataframe, with possible extension.
loc.nt.count.hs <- function(loc.df = NULL, genome.pkg = "BSgenome.Hsapiens.UCSC.hg19", extend = 0, blocksize = 1E+04, nthread = 5) {
  if (is.null(loc.df)) stop("loc.df is required !", call. = FALSE)
  
  if (extend < 0) stop("extend should be >= 0", call. = FALSE)
  if (blocksize <= 0) stop("blocksize should be > 0", call. = FALSE)
  if (!all(is.character(loc.df$chr) | is.factor(loc.df$chr))) stop("chr should be character !", call. = FALSE)
  if (!all(is.numeric(loc.df$start))) stop("start should be numeric !", call. = FALSE)
  if (!all(is.numeric(loc.df$end))) stop("end should be numeric !", call. = FALSE)

  if (!genome.pkg %in% BSgenome::installed.genomes()) {
    if (genome.pkg %in% BSgenome::available.genomes()) {
      stop(tmsg(paste0("BSgenome ", genome.pkg, " available but not installed. Please install it !")), call. = FALSE)
    } else {
      stop(tmsg(paste0("BSgenome ", genome.pkg, " not available in valid BSgenomes and not installed ... Please check your genome name or install your custom BSgenome !")), call. = FALSE)
    }
  }

  print(paste0("Loading ", genome.pkg, " sequence ..."))
  # requireNamespace(genome.pkg, quietly = TRUE)
  suppressPackageStartupMessages(require(genome.pkg, character.only = TRUE))
  # require(genome.pkg, character.only = TRUE)
  BSg.obj <- getExportedValue(genome.pkg, genome.pkg)
  # genome <- BSgenome::providerVersion(BSg.obj)
  genome <- metadata(BSg.obj)$genome
  cs <- chromobjector(BSg.obj)


  print("Removing replicated locations ...")
  idz <- paste0(loc.df$chr, ":", loc.df$start, "-", loc.df$end)
  loc.df <- loc.df[!duplicated(idz),]

  print("Removing non-canonical sequences ...")
  loc.df <- loc.df[loc.df$chr %in% seqnames(BSg.obj),]

  print("Ordering data ...")
  loc.df <- loc.df[order(unlist(cs$chrom2chr[loc.df$chr]), loc.df$start, loc.df$ProbeSetName),]


  
  myGR.ex <- suppressPackageStartupMessages(GenomicRanges::makeGRangesFromDataFrame(loc.df, seqinfo = seqinfo(BSg.obj)))
  if (extend > 0) myGR.ex <- GenomicRanges::trim(myGR.ex + extend)

  instep <- as.numeric(cut(seq_along(myGR.ex), seq.int(0, length(myGR.ex) + blocksize, blocksize)))

  print("Computing base composition ...")

  print("Starting cluster ...")
  
  if (length(unique(instep)) < nthread) nthread <- length(unique(instep))
  cl <- parallel::makeCluster(spec = nthread, type = "PSOCK", outfile = "")
  doParallel::registerDoParallel(cl)
  requireNamespace("foreach", quietly = TRUE)
  `%dopar%` <- foreach::"%dopar%"
  xcounts <- foreach::foreach(x = unique(instep), .combine = "rbind", .packages = c("Biostrings", "BSgenome"), .export = "getSeq") %dopar% {
    return(Biostrings::alphabetFrequency(BSgenome::getSeq(BSg.obj, myGR.ex[which(instep == x)]), baseOnly = TRUE))
  }
  print("Stopping cluster ...")
  parallel::stopCluster(cl)

  out.df <- cbind(loc.df, xcounts)

  return(out.df)
}

loc.nt.gcc.hs <- function(loc.counts = NULL) {
  gcc <- (loc.counts$C + loc.counts$G) / (loc.counts$A + loc.counts$C + loc.counts$G + loc.counts$T)
  return(data.frame(loc.counts, GC = gcc, stringsAsFactors = FALSE))
}

## Compute GC on a (chr, start, end) dataframe using multiple extend values
loc.nt.gcc.hs.multi <- function(loc.df = NULL, extend.multi = c(50, 100, 200, 400, 800, 1600, 3200, 6400), ...) {
  # require(foreach)
  requireNamespace("foreach", quietly = TRUE)
  `%do%` <- foreach::"%do%"
  gc.list <- foreach::foreach(nt.add = extend.multi) %do% {
    print(paste0("Computing GC +", nt.add, " ..."))
    adb.counts <- loc.nt.count.hs(loc.df = loc.df, extend = nt.add, ...)
    adb.gc <- loc.nt.gcc.hs(loc.counts = adb.counts)
    return(adb.gc)
  }
  base.df <- gc.list[[1]][,1:4]
  gc.df <- foreach::foreach(nt.add = seq_along(gc.list), .combine = "cbind") %do% { return(as.integer(round(gc.list[[nt.add]][["GC"]] * 1000))) }
  colnames(gc.df) <- paste0("GC", extend.multi)
  return(data.frame(base.df, gc.df, stringsAsFactors = FALSE))
}

genome.build.finder <- function(BAM.header = NULL, valid.genomes = NULL) {
  BAM.header <- unlist(BAM.header)
  query <- paste0("(", paste0(valid.genomes, collapse = "|"), ")")
  stvh.grep <- grep(query, unlist(BAM.header))
  if (length(stvh.grep) == 0) stop(tmsg("Could not automatically determine genome build ! Please specify it !"), call. = FALSE)

  stvh.regexec <- unique(vapply(stvh.grep, function(x) {
    rc.res <- regexec(query, BAM.header[x])[[1]]
    return(as.character(substr(BAM.header[x], start = rc.res[1], stop = rc.res[1]+attr(rc.res, "match.length")[1]-1)))
  }, "a"))

  ok.genome <- unique(stvh.regexec[stvh.regexec %in% valid.genomes])
  if (length(ok.genome) == 0) stop(tmsg(paste0("Identified a putative genome build (", ok.genome, "), but not a supported one !")), call. = FALSE)
  if (length(ok.genome) >= 2) stop(tmsg(paste0("Identified more than one putative genome build (", ok.genome, ") !")), call. = FALSE)
  return(ok.genome)
}
gustaveroussy/EaCoN documentation built on Oct. 20, 2021, 2:41 a.m.