R/FolderAnalysis_functions.R

Defines functions GetGeneRegionsBand GetGeneRegionsStatusSummary KaryotypeAberrationPlot PoolsPlots SampleReadCountsByPool SampleReadCountsPools Strict CNVPanelizer CNVPanelizerFromReadCounts CNVPanelizerFromReadCountsHELPER PlotBootstrapDistributions StatusHeatmap revalueDF CollectColumnFromAllReportTables SelectReferenceSetFromReadCounts SelectReferenceSet SelectReferenceSetByKmeans SelectReferenceSetByInterquartileRange SelectReferenceSetByPercentil IterateAmplNum NumberOfUniqueAmplicons countAmplicons SampleNoiseGenes DescriptiveStatistics SampleRatio RatioMatrix AmplProb RemSigGenes BackgroundReport ReportTables ReliableStatus roundDf Background enforceDeterministicResult AmplProbMultipeSamples NonSignificantGeneIndex CheckSignificance HaveMininumNumberOfAmplicons ConsensusCheck BootList GeneMeanRatioMatrix GenePositionMean PositionMean SubsamplingPositions ratiosMean adjustedLength ReadXLSXToList WriteListToXLSX IndexMultipleBams BamIndexFilepaths IndexGenesPositions NormalizeCounts tss VerifiyIfOutputDirectoryExistsOrIsNotEmpty CombinedNormalizedCounts ReadCountsFromBam BedToGenomicRanges

Documented in Background BedToGenomicRanges BootList CNVPanelizer CNVPanelizerFromReadCounts CNVPanelizerFromReadCountsHELPER CollectColumnFromAllReportTables CombinedNormalizedCounts IndexMultipleBams NormalizeCounts PlotBootstrapDistributions ReadCountsFromBam ReadXLSXToList ReportTables SelectReferenceSetByInterquartileRange SelectReferenceSetByKmeans SelectReferenceSetByPercentil SelectReferenceSetFromReadCounts StatusHeatmap WriteListToXLSX

###############################################################################
# functions to read the read counts
###############################################################################

BedToGenomicRanges <- function(panelBedFilepath,
                               ampliconColumn,
                               split = "_",
                               doReduce = TRUE,
                               rangeExtend = 0,
                               dropChromossomes = NA,
                               skip = 1) {
  #load the bed file
  segments <- read.table(panelBedFilepath,
                         sep = "\t",
                         as.is = TRUE,
                         skip = skip)
  gr <- GRanges(segments[, 1], IRanges(segments[, 2], segments[, 3]))

  #make sure that all regions are more than one read apart
  GenomicRanges::start(gr) <- GenomicRanges::start(gr) - rangeExtend
  GenomicRanges::end(gr) <- GenomicRanges::end(gr) + rangeExtend

  if(doReduce) {
    reducedGr <- GenomicRanges::reduce(gr, ignore.strand = TRUE)

    #get the genes for each reduced range
    hitsAnnotation <- GenomicRanges::findOverlaps(reducedGr, gr,
                                                  select = "first",
                                                  ignore.strand=TRUE)
    #get the amplicon names from the segments
    amplicons = segments[hitsAnnotation, ampliconColumn]
    #replace the original gr with the reduced gr
    gr <- reducedGr
  } else{
    amplicons = segments[ , ampliconColumn]
  }

  #get the genes from segments
  splitted = strsplit(amplicons, split = split)
  # Required because of package checking complaints
  i <- NULL
  genes = foreach(i = seq_along(splitted)) %do% {
    splitted[[i]][1]
  }

  genes = unlist(genes)
  # missing the dollar symbol before text because of groovy script
  elementMetadata(gr)["geneNames"] = genes
  elementMetadata(gr)["ampliconNames"] = paste(seqnames(gr),
                                               start(gr),
                                               end(gr),
                                               sep = "_",
                                               amplicons)

  if (!is.na(dropChromossomes)) {
    gr <- dropSeqlevels(gr, dropChromossomes)
  }
  return(gr)
}

#load the files you want to analyze
ReadCountsFromBam <- function(bamFilenames,
                              sampleNames,
                              gr,
                              ampliconNames,
                              minimumMappingQuality = 20,
                              removeDup = FALSE) {

  if (missing(sampleNames)) {
    sampleNames = bamFilenames
  }

  if (missing(ampliconNames)) {
    ampliconNames = elementMetadata(gr)$ampliconNames
  }

  bamIndexFilepaths <- BamIndexFilepaths(bamFilenames)

  # Because of package check complaints
  i <- NULL
  curbam = foreach(i = seq_along(bamFilenames), .combine = cbind) %do% {

    if (!file.exists(bamIndexFilepaths[i])) {
      message(paste("Bai file not found. Creating bai file for", bamFilenames[i], "..."))
      IndexMultipleBams(bamFilenames)
    }

    message(paste("Reading counts for bam file: ", bamFilenames[i]))
    countBamInGRanges(bamFilenames[i],
                      gr,
                      remove.dup = removeDup,
                      min.mapq = minimumMappingQuality,
                      get.width = TRUE)
  }

  listOfBamsHasOnlyOneElement <- length(bamFilenames) == 1
  if (listOfBamsHasOnlyOneElement) {
    curbam <- matrix(curbam)
  }

  colnames(curbam) = sampleNames
  rownames(curbam) = ampliconNames
  return(curbam)
}

#get the combined counts
CombinedNormalizedCounts <- function (sampleCounts,
                                      referenceCounts,
                                      method = "tmm",
                                      ampliconNames = NULL) {
  sampleCounts <- as.matrix(sampleCounts)
  referenceCounts <- as.matrix(referenceCounts)
  allCounts <- cbind(sampleCounts, referenceCounts)
  classes <- rep(c("samples", "reference"), c(ncol(sampleCounts),
                                              ncol(referenceCounts)))
  bamDataRangesNorm <- NormalizeCounts(allCounts, method = method)
  #  bamDataRangesNorm <- NormalizeCounts(allCounts)
  if (!is.null(ampliconNames)) {
    rownames(bamDataRangesNorm) = ampliconNames
  }
  normalizedSamples <- bamDataRangesNorm[, which(classes == "samples"), drop = FALSE]
  normalizedReference <- bamDataRangesNorm[, which(classes == "reference"), drop = FALSE]
  return(list(samples = normalizedSamples, reference = normalizedReference))
}

###############################################################################
# helper functions
###############################################################################

VerifiyIfOutputDirectoryExistsOrIsNotEmpty <- function(outputDir) {
  outputDirectoryExists <- file.exists(outputDir)
  outputDirectoryIsNotEmpty <- length(dir(outputDir, all.files=TRUE)) != 0
  if (outputDirectoryExists || outputDirectoryIsNotEmpty) {
    stop("Output directory already exists or is not empty. Please delete or change output directory")
  }
}

tss <- function(allCounts) {
  return(apply(allCounts, 2, function(x) {x / sum(x)}))
}

NormalizeCounts <- function(allCounts, method = "tmm") {
  # TODO check if normalization method is supported
  allowedNormalizationMethods <- c("tmm", # trimmed mean of M-values
                                   "tss") # total sum scaling
  tinyValueToAvoidZeroReadCounts <- 1e-05
  bamDataRangesNorm <- NULL
  if (method == "tmm") {
    bamDataRangesNorm <- tmm(allCounts, refColumn = NULL, k = tinyValueToAvoidZeroReadCounts)
  } else if (method == "tss") {
    #    bamDataRangesNorm <- apply(allCounts, 2, function(x) {x / sum(x)})
    bamDataRangesNorm <- tss(allCounts)
  } else {
    message(paste("method", method, "not included in the allowed Normalization methods", allowedNormalizationMethods))
  }
  return(bamDataRangesNorm)
}

#get the positions for each gene as a list
IndexGenesPositions = function(genes) {
  positions = seq_along(genes)
  genesPos = split(positions, genes)
  return(genesPos)
}

# #how many numbers in a vector are above a given cutoff
# PercentAboveValue <- function(vector, value) {
#     sum(vector > value)/length(vector)
# }
#
# #how many numbers in a vector are below a given cutoff
# PercentBelowValue <- function(vector, value) {
#     sum(vector < value)/length(vector)
# }

BamIndexFilepaths <- function(bamFilepaths, indexType = ".bam.bai", ignoreCase = FALSE) {
  return (gsub(".bam$",
               bamFilepaths,
               replacement = indexType,
               ignore.case = ignoreCase))
}

# check this params with Thomas
#index the bam files if there is no index yet
IndexMultipleBams <- function(bams,
                              index_type = ".bam.bai") {
  #check if the index already exists and need to be indexed
  #     potentialBaiFilenames <- gsub(".bam$",
  #                                 bams,
  #                                 replacement = index_type,
  #                                 ignore.case = TRUE)

  potentialBaiFilenames <- BamIndexFilepaths(bams, indexType = index_type)

  print(paste("potentialBaiFilenames", potentialBaiFilenames))

  bamsToBeIndexed <- bams[!sapply(potentialBaiFilenames, file.exists)]
  if(length(bamsToBeIndexed) > 0) {
    #index the bams
    #if(multicore) {
    # check with Thomas if we can replace by this..
    #  bplapply(bamsToBeIndexed, indexBam)
    #mclapply(bamsToBeIndexed, indexBam, mc.cores = ncores)
    #} else {
    lapply(bamsToBeIndexed, indexBam)
    #}
  }
}

WriteListToXLSX <- function(listOfDataFrames, multipleFiles = FALSE, outputFolder = file.path(getwd(), "xlsx"), filepath = "list.xlsx") {
  if (multipleFiles) {
    dir.create(outputFolder, recursive = TRUE, showWarnings = FALSE)
    for(name in names(listOfDataFrames)){
      message(paste0("Saving file to '", file.path(outputFolder, paste0(name, ".xlsx"))))
      write.xlsx(listOfDataFrames[[name]], file = file.path(outputFolder, paste0(name, ".xlsx")), rowNames = TRUE)
    }
  } else {
    message(paste0("Saving file to '", filepath, "'"))
    write.xlsx(listOfDataFrames, file = filepath, rowNames = TRUE)
  }
}

ReadXLSXToList <- function(filepath, rowNames = TRUE, colNames = TRUE) {
  listOfDataFrames <- list()
  for(name in getSheetNames(filepath)) {
    listOfDataFrames[[name]] <- read.xlsx(filepath,
                                          rowNames = rowNames,
                                          colNames = colNames,
                                          sheet = name)
  }
  return(listOfDataFrames)
}

###############################################################################
# functions for bootstrapping
###############################################################################

adjustedLength <- function(a) {
  round(sqrt(length(a)))
}

ratiosMean <- function(ratios) {
  return(exp(mean(log(ratios))))
}

#a function that randomly samples positions from a vector using subsampling
SubsamplingPositions <- function(pos, mtry = adjustedLength(pos)) {
  return(sample(pos, mtry, replace = FALSE))
}

#calculates the mean from a vector given specific positions
PositionMean <- function(position, vector) {
  return(ratiosMean(vector[position]))
}

#calculate the mean for each gene
GenePositionMean <- function(genesPos, vector) {
  means = sapply(genesPos, PositionMean, vector = vector)
  names(means) = names(genesPos)
  return(means)
}

#calculates the mean from a vector given specific positions for a matrix
GeneMeanRatioMatrix <- function(genesPos, ratioMatrix) {
  # Package check complaints
  i <- NULL
  ratioList =  foreach(i = 1:ncol(ratioMatrix), .combine = cbind) %do% {
    GenePositionMean(genesPos,ratioMatrix[, i])
  }
  ratioList <- as.matrix(ratioList)
  colnames(ratioList) <- colnames(ratioMatrix)
  return(ratioList)
}

# ReferenceWeights <- function(refmat, varianceFunction = sd) {
#     variance = apply(refmat, 2, varianceFunction)
#     weights = 1/variance
#     return(weights)
# }

# split into a function generating the bootstrap distribution
# and a function averaging over amplicons.
# define the function to generate bootstrap distributions and
# return a list with the genewise bootstrapping
BootList <- function(geneNames, sampleMatrix, refmat, replicates) {
  enforceDeterministicResult()
  # get the genes positions in the matrix as a list from a gene name vector
  genesPos <- IndexGenesPositions(geneNames)

  if (class(sampleMatrix) != "matrix") {
    stop(paste("Parameter 'sampleMatrix' has to be of class matrix and is of class", class(sampleMatrix)))
  }

  if (is.null(colnames(sampleMatrix)) | length(colnames(sampleMatrix))!=ncol(sampleMatrix)) {
    stop("All columns of 'sampleMatrix' have to be named")
  }

  # a vector and matrix are not the same and for a vector iterating over
  # the column makes no sense so we have
  # to check if a matrix or a vector was passed. ncol only works for matrix
  # not for vector
  if (class(sampleMatrix) == "matrix") {
    iterator <- 1:ncol(sampleMatrix)
    # } else {
    #     iterator <- 1
  }

  i <- NULL
  j <- NULL
  bootListSamples <- foreach(i = iterator) %:%
    foreach(j = rep(1, replicates), .combine = rbind) %do% {
      # a vector and matrix are not the same and for a vector
      # iterating over the column makes no sense so we have to check
      # if a mtrix or a vector was passed.
      if (class(sampleMatrix) == "matrix") {
        testSample <- sampleMatrix[, i]
        # } else {
        #     testSample <- sampleMatrix
      }
      # for each gene subsample the amplicon positions independently
      # sample the samples using bootstrapping
      sampleBootPos <- sample(1:ncol(refmat),
                              ncol(refmat),
                              replace = TRUE)
      geneBootPos <- c(lapply(genesPos,
                              SubsamplingPositions),
                       recursive = TRUE)
      # given the obtained sampling using the bootstraps calculated
      refMatPos <- refmat[geneBootPos, sampleBootPos, drop=FALSE]

      bootRatio <- testSample[geneBootPos]/rowMeans(refMatPos)

      # after the bootstrapping the gene positions in the vector
      # changes so recalculate them
      splitClass <- rep(names(genesPos), sapply(genesPos,
                                                adjustedLength))
      newGenesPos <- split(seq_along(splitClass), splitClass)
      sapply(newGenesPos, PositionMean, vector = bootRatio)
    }

  names(bootListSamples) <- basename(colnames(sampleMatrix))

  # In case a single step of bootstrap is done (for testing purposes),
  # the elements of each sample will be converted to a matrix
  for (i in seq_along(bootListSamples)) {
    bootListSamples[[i]] <- matrix(bootListSamples[[i]], ncol = length(names(genesPos)))
    colnames(bootListSamples[[i]]) <- names(genesPos)
  }
  return(bootListSamples)
}



ConsensusCheck <- function(genomicRangesFromBed, sampleMatrix, referenceMatrix) {
  #  sampleMatrix <- samplesNormalizedReadCounts
  #  referenceMatrix <- referenceNormalizedReadCounts

  #  View(sampleMatrix[, c(1,2)])
  #  View(apply(referenceMatrix, 1, median))

  #  ConsensusCheck(genomicRangesFromBed, [email protected], [email protected]),

  #  sampleMatrix <- [email protected]
  #  referenceMatrix <- [email protected]


  # This should not reach here.. but just in case..
  colnames(sampleMatrix) <- basename(colnames(sampleMatrix))
  colnames(referenceMatrix) <- basename(colnames(referenceMatrix))

  geneNames <- genomicRangesFromBed$geneNames

  allSampleRatios <- data.frame()
  for (smpl in colnames(sampleMatrix)) {
    #    smpl <- colnames(sampleMatrix)[which(grepl("28208", colnames(sampleMatrix)))]
    ampliconsRatio <- sampleMatrix[, smpl]/apply(referenceMatrix, 1, median)
    allSampleRatios <- rbind(allSampleRatios, ampliconsRatio)
  }

  allSampleRatios <- t(allSampleRatios)
  rownames(allSampleRatios) <- rownames(sampleMatrix)
  colnames(allSampleRatios) <- colnames(sampleMatrix)

  genesPositions <- IndexGenesPositions(geneNames)

  geneConsensus <- data.frame(matrix(nrow=length(genesPositions), ncol=ncol(sampleMatrix)))
  geneConsensus <- data.frame()
  for (smpl in colnames(allSampleRatios)) {
    for (gene in names(genesPositions)) {
      geneAmpliconsRatios <- allSampleRatios[,smpl][genesPositions[[gene]]]
      ampliconConsensus <- all(geneAmpliconsRatios>1) | all(geneAmpliconsRatios<1)
      geneConsensus[gene, smpl] <- ampliconConsensus
    }
  }
  # TODO case the reference has amplicons with 0 read counts, it returns NA instead of boolean
  geneConsensus[is.na(geneConsensus)] <- FALSE
  return(geneConsensus)
}

HaveMininumNumberOfAmplicons <- function(genomicRangesFromBed, numberOfAmplicons) {
  geneNames <- genomicRangesFromBed$geneNames
  genesPositions <- IndexGenesPositions(geneNames)
  sapply(genesPositions, function(x) {length(x)>=numberOfAmplicons})
}

#calculate significance
CheckSignificance <- function(bootList, significanceLevel = 0.05) {
  # Bonferroni correction
  significanceLevel <- significanceLevel/ncol(bootList[[1]])
  margin <- significanceLevel/2

  # package check complains
  i <- NULL
  j <- NULL
  sigTables =  foreach(i = seq_along(bootList)) %:%
    foreach(j = 1:ncol(bootList[[i]]), .combine = rbind) %do% {
      bootRatioDistribution <- bootList[[i]][,j]
      #      meanNoise <- exp(mean(log(bootRatioDistribution)))
      meanNoise <- ratiosMean(bootRatioDistribution)
      lowerBound <- quantile(bootRatioDistribution, margin, type = 1)
      upperBound <- quantile(bootRatioDistribution, 1 - margin, type = 1)
      bounds <- c(lowerBound, meanNoise, upperBound)
      roundedBounds <- round(bounds, digits = 2)
      maybeAmplification <- roundedBounds[1] > 1
      maybeDeletion <- roundedBounds[3] < 1
      roundedSignificant <- maybeAmplification | maybeDeletion
      copyNumberPutativeStatus <- "Normal"
      if(maybeAmplification) {
        copyNumberPutativeStatus <- "Amplification"
      } else {
        if (maybeDeletion) {
          copyNumberPutativeStatus <- "Deletion"
        }
      }
      sigTable <- c(roundedBounds, roundedSignificant, copyNumberPutativeStatus)
      names(sigTable)  <- c("lowerBound", "mean", "upperBound", "isSig", "putativeStatus")
      sigTable
    }
  names(sigTables) <- names(bootList)

  # name the genes in the list. All samples should share the same gene names
  geneNames <- colnames(bootList[[1]])
  for (i in seq_along(sigTables)) {
    rownames(sigTables[[i]]) <- geneNames
  }
  return(sigTables)
}

#SignificantGenes <- function(sigList, genesPositionsIndex) {
#  # package check complains
#  i <- NULL
#  sigGenes = foreach(i = seq_along(sigList)) %do% {
#    names(genesPositionsIndex)[sigList[[i]][, "isSig"] == TRUE]
#  }
#  return(sigGenes)
#}

# #############################################################################
# # functions for background noise estimation
# #############################################################################
NonSignificantGeneIndex <- function(sigList, genesPositionsIndex) {
  # package check complains
  i <- NULL
  genePosNonSig = foreach(i = seq_along(sigList)) %do% {
    sigGenes = names(genesPositionsIndex)[sigList[[i]][, "isSig"] == TRUE]
    selectedIndex <- RemSigGenes(genesPositionsIndex, sigGenes)
    if (length(selectedIndex) == 0) {
      genesPositionsIndex
    } else {
      selectedIndex
    }
  }
  names(genePosNonSig) <- names(sigList)
  return(genePosNonSig)
}

AmplProbMultipeSamples <- function(genePosNonSig) {
  # package check complains
  i <- NULL
  amplWeights = foreach(i = seq_along(genePosNonSig)) %do% {
    AmplProb(genePosNonSig[[i]])
  }
  names(amplWeights) <- names(genePosNonSig)
  return(amplWeights)
}

enforceDeterministicResult <- function() {
  set.seed(1)
}

Background <- function(geneNames,
                       samplesNormalizedReadCounts,
                       referenceNormalizedReadCounts,
                       bootList,
                       replicates = 1000,
                       significanceLevel = 0.05,
                       robust = FALSE) {
  enforceDeterministicResult()
  #which genes showed significant changes
  sigList <- CheckSignificance(bootList)
  # gene index
  genesPositionsIndex <- IndexGenesPositions(geneNames)
  # calculate the reference mean
  #refMean <- apply(referenceNormalizedReadCounts, 1, mean)
  refMean <- rowMeans(referenceNormalizedReadCounts)
  # calculate the ratio matrix for each sample
  ratioMatrix <- RatioMatrix(samplesNormalizedReadCounts, refMean)
  # remove the significant genes from the noise estimation
  genesPosNonSig <- NonSignificantGeneIndex(sigList, genesPositionsIndex)
  # calculate the weight for each amplicon of non significant changed genes
  amplWeights <- AmplProbMultipeSamples(genesPosNonSig)
  uniqueAmpliconNumbers <- NumberOfUniqueAmplicons(genesPositionsIndex)

  sampleIndex <- NULL
  backgroundObject <- foreach(sampleIndex = seq_along(genesPosNonSig)) %do% {
    message(paste0("Calculating Background for ", colnames(samplesNormalizedReadCounts)[sampleIndex]))
    nonSigAmpliconRatios <- ratioMatrix[unlist(genesPosNonSig[[sampleIndex]]), sampleIndex]
    IterateAmplNum(uniqueAmpliconNumbers,
                   nonSigAmpliconRatios,
                   replicates = replicates,
                   probs = amplWeights[[sampleIndex]],
                   significanceLevel = significanceLevel,
                   robust = robust)
  }
  return(backgroundObject)
}

roundDf <- function(x, digits) {
  # round all numeric variables
  # x: data frame
  # digits: number of digits to round
  #  numeric_columns <- sapply(x, mode) == 'numeric'
  numeric_columns <- sapply(x, is.numeric)
  x[numeric_columns] <-  round(x[numeric_columns], digits)
  x
}

# Helper function to add column to reportTables
ReliableStatus <- function(putative, numberOfThresholdsPassed) {
  status <- as.vector(putative)
  #  print(status)
  if (numberOfThresholdsPassed < 2) {
    status <- "Normal"
  }
  return(status)
}

ReportTables <- function(geneNames,
                         samplesNormalizedReadCounts,
                         referenceNormalizedReadCounts,
                         bootList,
                         backgroundNoise) {
  # get the background noise in a format that can be used
  # for a report table
  backgroundReport <- BackgroundReport(backgroundNoise, geneNames)
  # gene index
  genesPositionsIndex <- IndexGenesPositions(geneNames)
  # calculate the reference mean
  #refMean <- apply(referenceNormalizedReadCounts, 1, mean)
  refMean <- rowMeans(referenceNormalizedReadCounts)
  # calculate the ratio matrix for each sample
  ratioMatrix <- RatioMatrix(samplesNormalizedReadCounts, refMean)
  # calculate the genewise ratio matrix from the ratio_mat
  ratioMatGene <- GeneMeanRatioMatrix(genesPositionsIndex, ratioMatrix)
  sigList <- CheckSignificance(bootList)
  # because package generaton complains..
  i <- NULL
  reportTables <- foreach(i = seq_along(backgroundReport)) %do% {
    #isSig <- (sigList[[i]][, "upperBound"] < 1) | (sigList[[i]][, "lowerBound"] > 1)
    putativeStatus <- sigList[[i]][, "putativeStatus"]
    isSig <- putativeStatus != "Normal"

    backgroundUp <- backgroundReport[[i]][, "UpperNoise"]
    backgroundDown <- backgroundReport[[i]][, "LowerNoise"]
    rMatGene <- ratioMatGene[, i]
    # TODO is this correct?! why compare rMatGene > 1 ?
    aboveNoise <- (rMatGene > 1 & sigList[[i]][, "lowerBound"] > backgroundUp) |
      (rMatGene < 1 & sigList[[i]][, "upperBound"] < backgroundDown)

    dfTemp <- data.frame(meanRatio = ratioMatGene[, i],
                         lowerBoundBootstrapRatio = sigList[[i]][, "lowerBound"],
                         meanBootstrapRatio = sigList[[i]][, "mean"],
                         upperBoundBootstrapRatio = sigList[[i]][, "upperBound"],
                         #                          lowerNoise = backgroundReport[[i]][, 1],
                         #                          meanNoise = backgroundReport[[i]][, 2],
                         #                          upperNoise = backgroundReport[[i]][, 3],
                         lowerNoise = backgroundReport[[i]][, "LowerNoise"],
                         meanNoise = backgroundReport[[i]][, "MeanNoise"],
                         upperNoise = backgroundReport[[i]][, "UpperNoise"],
                         significant = isSig,
                         aboveNoise = aboveNoise,
                         amplNum = as.vector(table(geneNames)),
                         putativeStatus = putativeStatus,
                         passed = isSig + aboveNoise)

    significativeNumbers <- 2
    dfTemp <- roundDf(dfTemp, significativeNumbers)
    names(dfTemp) <- c("MeanRatio",
                       "LowerBoundBoot",
                       "MeanBoot",
                       "UpperBoundBoot",
                       "LowerNoise",
                       "MeanNoise",
                       "UpperNoise",
                       "Signif.",
                       "AboveNoise",
                       "Amplicons",
                       "PutativeStatus",
                       "Passed")
    dfTemp
  }

  # TODO Check if this column added is correct
  for(i in 1:length(reportTables)) {
    tmpReportTable <- reportTables[[i]]
    #  tmpReportTable <- reportTables[[1]]
    status <- mapply(ReliableStatus, tmpReportTable$PutativeStatus, tmpReportTable$Passed)
    #  print(i)
    reportTables[[i]] <- cbind(reportTables[[i]], "ReliableStatus" = status)
  }

  #    names(reportTables) <- colnames(samplesNormalizedReadCounts)
  names(reportTables) <- names(bootList)
  return(reportTables)
}

BackgroundReport <- function(background, geneNames) {
  # because package generation complains..
  i <- NULL
  j <- NULL
  backgroundReport = foreach(i = seq_along(background)) %:%
    foreach(j = table(geneNames), .combine = rbind) %do% {
      background[[i]][[as.character(j)]]
    }
}

#remove genes that were considered significant by the algorithm
RemSigGenes <- function(genesPos, sigGenes) {
  #if some of the genes were reported with a significantly different read count
  #should not be used for
  #background testing
  #which genes are in the genes pos object
  geneNames= names(genesPos)
  #define a function for not in
  `%ni%` = Negate(`%in%`)
  #which of these genes did not show a significant read count  change
  nonSigGenes = which(geneNames %ni% sigGenes)
  #keep only the genes that did not show a significant read coount change
  nonSigGenesPos = genesPos[nonSigGenes]
  return(nonSigGenesPos)
}

AmplProb <- function(genesPos) {
  #how many amplicons where used for each ofhte genes
  geneCounts = countAmplicons(genesPos)
  #adjust the probablity for depending on the number of amplicons for each gene
  genePerc =  1/geneCounts
  #this information has to be available for each stable position
  ampliconProb = rep(genePerc, geneCounts)
  return(ampliconProb)
}

RatioMatrix <- function(sampleMat, refMean) {
  apply(sampleMat, 2, `/`, refMean)
}

SampleRatio <- function(ratios, numAmpl, amplWeights = NULL) {
  replace <- FALSE
  if(numAmpl >= length(ratios)) {
    replace <- TRUE
  }
  randomPos = sample(seq_along(ratios),
                     numAmpl,
                     prob = amplWeights,
                     replace = replace)
  randomlySelectedRatios = ratios[randomPos]
  #  randomMean = exp(mean(log(randomlySelectedRatios)))
  randomMean <- ratiosMean(randomlySelectedRatios)
  return(randomMean)
}

DescriptiveStatistics <- function(distribution, robust = FALSE) {
  #	print(paste("Robust is ", robust))
  if (robust) {
    centralTendency <- median(distribution)
    variability <- mad(distribution, constant = 1) # median absolute deviation
  } else {
    centralTendency <- mean(distribution)
    variability <- sd(distribution)
  }
  return(list(centralTendency = centralTendency,
              variability = variability))
}

SampleNoiseGenes <- function(numAmpl = 2,
                             ratios,
                             replicates = 100,
                             probs = NULL,
                             significanceLevel = 0.05,
                             robust = FALSE) {
  margin <- significanceLevel / 2
  # now repeat the sampling for the selected number of
  # amplicons replicates
  sampleNoiseDistribution = replicate(replicates,
                                      SampleRatio(ratios,
                                                  numAmpl,
                                                  amplWeights = probs))
  #get the upper, mean and lower values of the noise distribution
  logSampleNoiseDistribution <- log(sampleNoiseDistribution)

  ds <- DescriptiveStatistics(logSampleNoiseDistribution, robust)
  logMeanNoise <- ds$centralTendency
  logSdNoise <- ds$variability

  # if (robust) {
  #     logMeanNoise <- median(logSampleNoiseDistribution)
  #     logSdNoise <- mad(logSampleNoiseDistribution, constant = 1)
  # } else {
  #     logMeanNoise <- mean(logSampleNoiseDistribution)
  #     logSdNoise <- sd(logSampleNoiseDistribution)
  # }

  lowerBound <- exp(logMeanNoise + qnorm(margin) * logSdNoise)
  meanNoise <- exp(logMeanNoise)
  upperBound <- exp(logMeanNoise + qnorm(1 - margin) * logSdNoise)

  sampledNoise = c(lowerBound, meanNoise, upperBound)
  names(sampledNoise) = paste0(c("Lower", "Mean", "Upper"), "Noise")
  return(sampledNoise)
}

countAmplicons <- function(x) {
  return(sapply(x, NROW))
}

# Returns a list with the unique number of amplicons for all genes
NumberOfUniqueAmplicons <- function(genesPos) {
  # calcUniqueAmpliconNumbers <-function(genesPos) {
  ampliconNumbers = countAmplicons(genesPos)
  uniqueAmpliconNumbers = sort(unique(ampliconNumbers))
  return(uniqueAmpliconNumbers)
}

#now calculate the background for each of the unique amplicon numbers
# ratios: the
IterateAmplNum <- function(uniqueAmpliconNumbers,
                           ratios,
                           replicates = 100,
                           probs = NULL,
                           significanceLevel = 0.05,
                           robust = FALSE) {
  # Needed because of package check complaints..
  i <- NULL
  noiseResults = foreach(i = seq_along(uniqueAmpliconNumbers)) %do% {
    sampledNoise = SampleNoiseGenes(uniqueAmpliconNumbers[i],
                                    ratios = ratios,
                                    replicates = replicates,
                                    probs = probs,
                                    significanceLevel = significanceLevel,
                                    robust = robust)
    sampledNoise
  }
  names(noiseResults) = as.character(uniqueAmpliconNumbers)
  return(noiseResults)
}

SelectReferenceSetByPercentil <- function(allSamplesReadCounts,
                                          normalizationMethod = "tmm",
                                          lowerBoundPercentage = 1,
                                          upperBoundPercentage = 99) {

  allSamplesReadCountsNormalized <-NormalizeCounts(allSamplesReadCounts,
                                                   method = normalizationMethod)

  selectedSamplesFilepath <- NULL
  allSamples <- round(allSamplesReadCountsNormalized)
  samplesColnames <- colnames(allSamples)

  selectedRange <- list()
  for (i in 1:nrow(allSamples)) {
    #    selectedRange[[i]] <- quantile(allSamples[i,])
    selectedRange[[i]] <- quantile(allSamples[i,], c(lowerBoundPercentage, upperBoundPercentage) / 100)
  }

  selectedSamplesIndex <- NULL
  excludedSamplesIndex <- NULL
  for (j in 1:ncol(allSamples)) {
    addSample <- TRUE
    for(i in 1:nrow(allSamples)) {
      if ((allSamples[i, j] < selectedRange[[i]][1]) | (allSamples[i, j] > selectedRange[[i]][2])) {
        addSample <- FALSE
        #        print(paste("Add sample?", addSample, i, selectedRange[[i]][1], allSamples[i, j], selectedRange[[i]][2]))
        break
      }
    }
    if (addSample) {
      selectedSamplesIndex <- cbind(selectedSamplesIndex, j)
    } else {
      excludedSamplesIndex <- cbind(excludedSamplesIndex, j)
    }
  }

  selectedSamples <- allSamples[, selectedSamplesIndex]
  excludedSamples <- allSamples[, -excludedSamplesIndex]
  selectedSamplesFilepath <- colnames(selectedSamples)
  return(selectedSamplesFilepath)
}

SelectReferenceSetByInterquartileRange <- function(allSamplesReadCounts, normalizationMethod = "tmm", iqrFactor = 1) {
  allSamplesReadCounts <- NormalizeCounts(allSamplesReadCounts, method = normalizationMethod)
  fivenumReads <- t(apply(allSamplesReadCounts, 1, fivenum))
  colnames(fivenumReads) <- c("SampleMinimum", "LowerQuartile", "Median", "UpperQuartile", "SampleMaximum")
  iqr <- fivenumReads[, "UpperQuartile"] - fivenumReads[, "LowerQuartile"]
  dispersion <- iqrFactor * iqr
  sampleIndexes <- NULL
  for (i in 1:ncol(allSamplesReadCounts)) {
    if (all((allSamplesReadCounts[, i] >= (fivenumReads[,"LowerQuartile"] - dispersion ))  & ((dispersion + fivenumReads[,"UpperQuartile"])  >= allSamplesReadCounts[, i]))) {
      sampleIndexes <- c(sampleIndexes, i)
    }
  }
  return(colnames(allSamplesReadCounts)[sampleIndexes])
}

SelectReferenceSetByKmeans <- function(allSamplesReadCounts, normalizationMethod = "tmm", referenceNumberOfElements) {
  # referenceSamples <- function(referenceNumberOfElements, allSamples) {

  allSamples <- NormalizeCounts(allSamplesReadCounts, method = normalizationMethod)

  #  allSamples <- allSamplesReadCounts
  numberOfSamples <- ncol(allSamples)
  numberOfOutliers <- numberOfSamples - referenceNumberOfElements
  if (numberOfOutliers < 0) {
    message(paste("Number of samples", numberOfSamples, "is smaller than the size of the reference set requested", referenceNumberOfElements, ", so all samples will be selected"))
    return(colnames(allSamples))
  } else if (numberOfOutliers == 0) {
    message(paste("Number of samples", numberOfSamples, "is equal to the size of the reference set requested", referenceNumberOfElements))
    return(colnames(allSamples))
  } else {
    allSamples <- t(allSamples)
    kmeans.result <- kmeans(allSamples, centers=1)
    # "centers" is a data frame with one center
    centers <- kmeans.result$centers[kmeans.result$cluster, ]
    distances <- sqrt(rowSums((allSamples - centers)^2))
    outliers <- order(distances, decreasing=TRUE)[1:numberOfOutliers]
    # these rows are top outliers
    message(cat("outliers:", colnames(allSamplesReadCounts)[outliers]))
    allSamples <- t(allSamples)
    #    return(colnames(allSamples[, -outliers, drop = FALSE]))
    return(colnames(allSamples)[-outliers])
  }
}

SelectReferenceSet <- function(samplesFilepaths,
                               genomicRanges,
                               removePcrDuplicates = FALSE,
                               normalizationMethod = "tmm",
                               referenceMaximumNumberOfElements = 10,
                               referenceSelectionMethod = "percentil",
                               lowerBoundPercentage = 1,
                               upperBoundPercentage = 99) {

  #  samplesFilepaths <- allSamplesFilteredFilepaths[1:3]
  #  genomicRanges <- genomicRangesFromBed

  allSamplesReadCounts <- ReadCountsFromBam(bamFilenames = samplesFilepaths,
                                            sampleNames = samplesFilepaths,
                                            gr = genomicRanges,
                                            #                                            ampliconNames = ampliconNames,
                                            removeDup = removePcrDuplicates)

  selectedSamplesFilepath <- SelectReferenceSetFromReadCounts(allSamplesReadCounts,
                                                              #                                                              genomicRanges,
                                                              #                                                              removePcrDuplicates = removePcrDuplicates,
                                                              normalizationMethod = normalizationMethod,
                                                              referenceMaximumNumberOfElements = referenceMaximumNumberOfElements,
                                                              referenceSelectionMethod = referenceSelectionMethod,
                                                              lowerBoundPercentage = lowerBoundPercentage,
                                                              upperBoundPercentage = lowerBoundPercentage)
  return(selectedSamplesFilepath)
}

SelectReferenceSetFromReadCounts <- function(allSamplesReadCounts,
                                             #                               genomicRanges,
                                             #                               removePcrDuplicates = FALSE,
                                             normalizationMethod = "tmm",
                                             referenceMaximumNumberOfElements = 30,
                                             #                               referenceSelectionMethod = "percentil",
                                             referenceSelectionMethod = "kmeans",
                                             lowerBoundPercentage = 1,
                                             upperBoundPercentage = 99) {


  selectedSamplesFilepath <- NULL
  if (referenceSelectionMethod == "percentil") {
    message("Selecting reference by percentil")
    selectedSamplesFilepath <- SelectReferenceSetByPercentil(allSamplesReadCounts,
                                                             normalizationMethod = normalizationMethod,
                                                             #                                                              referenceSelectionMethod = referenceSelectionMethod,
                                                             lowerBoundPercentage = lowerBoundPercentage,
                                                             upperBoundPercentage = upperBoundPercentage)
  } else if (referenceSelectionMethod == "kmeans") {
    message("Selecting reference by kmeans")
    selectedSamplesFilepath <- SelectReferenceSetByKmeans(allSamplesReadCounts,
                                                          normalizationMethod = normalizationMethod,
                                                          referenceMaximumNumberOfElements)
  } else {
    message(paste("method", referenceSelectionMethod, "not supported"))
  }
  return(selectedSamplesFilepath)
}

CollectColumnFromAllReportTables <- function(reportTables, columnName) {
  compilation <- NULL
  mySampleNames <- names(reportTables)
  for(name in mySampleNames) {
    tmpReportTable <- reportTables[[name]]
    compilation <- cbind(compilation, as.vector(tmpReportTable[, columnName]))
  }
  colnames(compilation) <- mySampleNames
  rownames(compilation) <- rownames(reportTables[[1]])
  return(compilation)
}

revalueDF <- function(myDataFrame, mappings) {
  myDataFrameColnames <- colnames(myDataFrame)
  for(columnName in myDataFrameColnames) {
    myDataFrame[, columnName] <- revalue(myDataFrame[, columnName], mappings)
  }
  mode(myDataFrame) <- "numeric"
  return(myDataFrame)
}

## TODO check this function
# ReliableAberrationStatusHeatMap <- function(meanBootResults,
#                                             realiableResults,
#                                             filepath = NULL) {
#   meanBootResults <- melanomaMeanBootResults
#   reliableResults <- melanomaCNVPanelizerResults
#   test_that("testing correct dimensions", {
#     expect_equal(colnames(meanBootResults), colnames(reliableResults))
#     expect_equal(rownames(meanBootResults), rownames(reliableResults))
#   })
#
#   kNormalDefaultValue <- 1.0
#   kAmplificationMaximalValue <- 4.0
#   kAmplificationMinimumValue <- 2.0
#
#   for(i in rownames(reliableResults)) {
#     for(j in colnames(reliableResults)) {
#       if (reliableResults[i,j] == "Normal") {
#         #        print(paste("Normal", i, reliableResults[i,j], "setting ", meanBootResults[i,j],"as", kNormalDefaultValue))
#         meanBootResults[i,j] <- kNormalDefaultValue
#       }
#
#       if ((reliableResults[i,j] == "Amplification") & (meanBootResults[i,j] > kAmplificationMaximalValue)) {
#         print(paste("Too high ", i, reliableResults[i,j], "setting ", meanBootResults[i,j],"as ", kAmplificationMaximalValue))
#         meanBootResults[i,j] <- kAmplificationMaximalValue
#       }
#
#       if ((reliableResults[i,j] == "Amplification") & (meanBootResults[i,j] < kAmplificationMinimumValue)) {
#         print(paste("Amplification too low", i, reliableResults[i,j], "setting ", meanBootResults[i,j],"as", kNormalDefaultValue))
#         meanBootResults[i,j] <- kNormalDefaultValue
#       }
#     }
#   }
#
#   df.team_data <- melt(meanBootResults)
#   colnames(df.team_data) <- c("Gene", "Sample", "Bootratio")
#   g <- ggplot(data = df.team_data,
#               aes(x = Gene, y = Sample)) +
#     #    geom_tile(aes(fill = scale(Bootratio))) +
#     geom_tile(aes(fill = log(Bootratio))) +
#     #      + opts(theme(axis.text.x = element_text(angle = 180, hjust = 1)))
#     #       scale_fill_gradient2(low="darkblue", high="darkgreen", guide="colorbar") +
#     #      scale_fill_gradient2(low="green4", high="red4", guide="colorbar") +
#     scale_fill_gradient2(low = "darkgreen", mid = "white", high = "darkred") +
#     theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
#     theme(axis.text.y = element_text(size=5))
#   if (!missing(filepath)) {
#     ggsave(filepath)
#   }
# }



# why these colors by default:
StatusHeatmap <- function(dfData,
                          statusColors = c("Deletion" = "blue",
                                           "Normal" = "green",
                                           #                                         " " = "green",
                                           "Amplification" = "red"),
                          header = "Status Heatmap",
                          filepath = "CNVPanelizerHeatMap.png") {

  mappings <- seq(length(statusColors))
  #  mappings <- c(-1, 0, 1)
  names(mappings) <- names(statusColors)
  matrixData <- revalueDF(dfData, mappings)

  valuesUsed <- table(matrixData)
  indexOfColorsUsed <- as.numeric(names(valuesUsed))
  colorsRequired <- unname(statusColors[indexOfColorsUsed])

  # TODO find out why do I need this horrible hack when only one status is available
  if (length(colorsRequired) <= 1) {
    colorsRequired <- c("green", "yellow")
  }

  #  colorsRequired <- as.vector(statusColors[as.numeric(names(table(matrixData)))])

  png(filename = file.path(filepath),    # create PNG for the heat map
      width = 5*300,        # 5 x 300 pixels
      height = 5*300,
      res = 300,            # 300 pixels per inch
      pointsize = 8)        # smaller font size

  lmat <- rbind( c(5,3,4), c(2,1,4) )
  lhei <- c(1.5, 4)
  lhei <- c(1, 10)
  lwid <- c(1.5, 4, 0.75)

  myCexRow <- 0.2

  heatmap.2(matrixData,
            main = header,
            # plot layout
            lmat = lmat,
            lhei = lhei,
            lwid = lwid,
            #          scale = "row",
            #          ColSideColors=heatColors,
            key = FALSE, # whether a color-key should be shown
            #          key = TRUE, # whether a color-key should be shown
            #          key.par=list(match=c(0),
            #             amplification_deletion=c(1, 2, 3, 4),
            #             deletion_amplification=c(5, 6)),
            dendrogram = "none",
            tracecol=NA,
            #          density.info=c("histogram","density","none"),
            #            col = unname(statusColors),
            #            col = c("green"),
            col = colorsRequired,    # if the values are not all revalued the colors have to be passed selectively

            #          col = heatColors,
            #          col = heatColors[matchPoints],
            colsep=1:ncol(matrixData),
            rowsep=1:nrow(matrixData),
            margins = c(15, 5),
            Rowv=FALSE,
            Colv=FALSE,
            #            cexRow=myCexRow
  )

  #   legend("topleft", inset=.02, title= "UOUOU",
  #          "this is legend",
  #          #         fill=topo.colors(3),
  # #         fill=unname(statusColors),
  #          horiz=TRUE, cex=0.8)
  #bottomleft
  #  legend("bottom", inset=.01, title="CNV Status",





  #
  #
  #          legend("left", inset=.01, title="CNV Status",
  # #                legend("left", inset=0, title="CNV Status",
  #          names(statusColors),
  #          #         fill=topo.colors(3),
  #          fill=unname(statusColors),
  #          cex=0.8)
  # #         horiz=TRUE, cex=0.8)
  #




  #   legend("bottomleft", legend=c("Line 1", "Line 2"),
  #          col=c("red", "blue"), lty=1:2, cex=0.8)

  #   legend(1, 95, legend=c("Line 1", "Line 2"),
  #          col=c("red", "blue"), lty=1:2, cex=0.8)

  # in case of mistake this instruction should be called until return null instead of png..
  dev.off()
}

## TODO check
# StatusStability <- function(geneNames, sampleNormalizedReadCounts, tmpReferenceNormalizedReadCounts) {
#   centralTendency <-  matrix(rowMeans(tmpReferenceNormalizedReadCounts), ncol = 1)
#   rownames(centralTendency) <- rownames(tmpReferenceNormalizedReadCounts)
#   referenceAndSampleDifferences <- centralTendency - sampleNormalizedReadCounts
#   rownames(referenceAndSampleDifferences) <- rownames(tmpReferenceNormalizedReadCounts)
#   colnames(referenceAndSampleDifferences) <- "difference"
#   uniqueGeneNames <- unique(geneNames)
#   geneStabilityStatus <- c()
#   for (i in seq(uniqueGeneNames)) {
#     geneAmpliconIndexes <- grep(uniqueGeneNames[i], rownames(myReference))
#     geneAmpliconStatus <- referenceAndSampleDifferences[geneAmpliconIndexes, ]
#     allGeneAmpliconsHaveSameStatus <- length(unique(geneAmpliconStatus >= 0))==1
#     geneStabilityStatus <- c(geneStabilityStatus, allGeneAmpliconsHaveSameStatus)
#   }
#   names(geneStabilityStatus) <- uniqueGeneNames
#   return(geneStabilityStatus)
# }

## TODO CHECK
# StatusStabilityTable <- function(geneNames, tmpSamplesNormalizedReadCounts, tmpReferenceNormalizedReadCounts) {
#   statusStabilityTable <- NULL
#   for (i in seq(ncol(tmpSamplesNormalizedReadCounts))) {
#     statusStabilityTable <- cbind(statusStabilityTable, StatusStability(geneNames, matrix(tmpSamplesNormalizedReadCounts[, i], ncol = 1), tmpReferenceNormalizedReadCounts))
#   }
#   colnames(statusStabilityTable) <- colnames(tmpSamplesNormalizedReadCounts)
#   #   statusStabilityTable <- t(statusStabilityTable)
#   #   colnames(statusStabilityTable) <- colnames(tmpSamplesNormalizedReadCounts)
#   return(statusStabilityTable)
# }

PlotBootstrapDistributions  <- function(bootList,
                                        reportTables,
                                        outputFolder = getwd(),
                                        sampleNames = NULL,
                                        save = FALSE,
                                        #                                        scale = 7) {
                                        scale = 10) {
  selSample <- NULL
  plotList <- foreach(selSample = seq_along(bootList)) %do% {
    test <- as.factor(reportTables[[selSample]][, "Passed"])

    levelLabels <- c("NoChange", "NonReliableChange", "ReliableChange")
    names(levelLabels) <- c(0, 1, 2)
    test <- suppressMessages(revalue(test, levelLabels))
    namedColors <- c("#56B4E9", "#CC79A7" ,"#D55E00")
    names(namedColors) <- levelLabels

    ratios <- NULL
    testsPassed <- NULL
    df <- data.frame(class = as.factor(colnames(bootList[[selSample]])),
                     ratios = (as.vector(t(bootList[[selSample]]))),
                     testsPassed = test)

    #if a genomic ranges object has been
    #if(!is.null(gr)) {
    # newGeneOrder <- bedToGeneOrder(gr)
    # df$class = with(df,factor(class,levels(class)[newGeneOrder]))
    #}

    ylim1 <- boxplot.stats((df$ratios))$stats[c(1, 5)]
    ylim1 <- c(max(ylim1),max(ylim1))
    ylim1 <- log2(ylim1)
    ylim1 <- ylim1 * c(-scale,scale)

    if(is.null(sampleNames)) {
      filename <- names(bootList[selSample])
      filepath <- paste0(outputFolder, "/", filename, "_plot.pdf")
    } else {
      filename <- sampleNames[selSample]
      filepath <- paste0(outputFolder, "/", filename, ".pdf")
    }
    bootPlot <- ggplot(df, aes(x = class, y = log2(ratios), fill = testsPassed)) +
      geom_violin() + ggtitle(filename) +
      theme(plot.title = element_text(lineheight = 0.8, face = "bold"),
            text = element_text(size = 15),
            axis.text.x = element_text(angle = 90)) +
      scale_fill_manual(name = "CNV Reliability",
                        values = namedColors,
                        labels = c("0" = "Foo", "1" = "Bar")) +
      coord_cartesian(ylim = ylim1) +
      theme(axis.text=element_text(size=10),
            axis.title=element_text(size=20,face="bold")) +
      scale_x_discrete("Gene Names") +
      #       geom_hline(yintercept=log2(1.5), color="#009E73") +
      #       geom_hline(yintercept=log2(0.5), color="#009E73") +
      geom_hline(yintercept=0, color="#009E73")

    if(save == TRUE) {
      message(paste0("Saving plot to '", filepath, "'"))
      #            dir.create(outputFolder, recursive = TRUE, showWarnings = TRUE)
      dir.create(outputFolder, recursive = TRUE, showWarnings = FALSE)
      ggsave(filename = filepath,
             plot = bootPlot,
             height = 7.42 * 1,
             width = 8.11 * 2,
             limitsize = FALSE)
    }
    #        dev.off()
    return(bootPlot)
  }
  names(plotList) <- names(reportTables)
  return(plotList)
}

CNVPanelizerFromReadCountsHELPER <- function(sampleReadCounts,
                                             referenceReadCounts,
                                             genomicRangesFromBed,
                                             numberOfBootstrapReplicates = 10000,
                                             normalizationMethod = "tmm",
                                             robust = TRUE,
                                             backgroundSignificanceLevel = 0.05,
                                             outputDir = file.path(getwd(), "CNVPanelizer"),
                                             splitSize = 5) {
  myCNVPanelizerResults <- list()

  splits <- split(1:ncol(sampleReadCounts), ceiling(seq_along(1:ncol(sampleReadCounts))/splitSize))

  #    for (i in 1:ncol(sampleReadCounts)) {

  for (i in 1:length(splits)) {
    myCNVPanelizerResults[[i]] <- CNVPanelizerFromReadCounts(sampleReadCounts[, splits[[i]]],
                                                             referenceReadCounts,
                                                             genomicRangesFromBed,
                                                             numberOfBootstrapReplicates = numberOfBootstrapReplicates,
                                                             normalizationMethod = normalizationMethod,
                                                             robust = robust,
                                                             backgroundSignificanceLevel = backgroundSignificanceLevel,
                                                             outputDir = outputDir)
  }

  n <- length(myCNVPanelizerResults)
  res <- NULL
  for (i in seq(n)) {
    res <- cbind(res, myCNVPanelizerResults[[i]])
  }

  myCNVPanelizerResultsCompiled <- myCNVPanelizerResults[[1]]

  if(length(myCNVPanelizerResults) > 1) {
    for (i in 2:length(myCNVPanelizerResults)) {
      myCNVPanelizerResultsCompiled$sampleReadCounts <- cbind(myCNVPanelizerResultsCompiled$sampleReadCounts, myCNVPanelizerResults[[i]]$sampleReadCounts)
      myCNVPanelizerResultsCompiled$bootList <- append(myCNVPanelizerResultsCompiled$bootList, myCNVPanelizerResults[[i]]$bootList)
      myCNVPanelizerResultsCompiled$backgroundNoise <- append(myCNVPanelizerResultsCompiled$backgroundNoise, myCNVPanelizerResults[[i]]$backgroundNoise)
      myCNVPanelizerResultsCompiled$reportTables <- append(myCNVPanelizerResultsCompiled$reportTables, myCNVPanelizerResults[[i]]$reportTables)
    }
  }
  return(myCNVPanelizerResultsCompiled)
}

######################################################################################
# still on testing..
######################################################################################
# CNVPanelizerFromReadCountsHelperParallel <- function(sampleReadCounts,
#                                                      referenceReadCounts,
#                                                      genomicRangesFromBed,
#                                                      numberOfBootstrapReplicates = 10000,
#                                                      normalizationMethod = "tmm",
#                                                      robust = TRUE,
#                                                      backgroundSignificanceLevel = 0.05,
#                                                      outputDir = file.path(getwd(), "CNVPanelizer"),
#                                                      splitSize = 5) {
#
#   myCNVPanelizerResults <- list()
#
#   splits <- split(1:ncol(sampleReadCounts), ceiling(seq_along(1:ncol(sampleReadCounts))/splitSize))
#
#   # Use the detectCores() function to find the number of cores in system
#   no_cores <- detectCores()
#   # Setup cluster
#   clust <- makeCluster(no_cores - 1)
#   clusterExport(clust,
#                 varlist = c(
#                   "sampleReadCounts",
#                   "referenceReadCounts",
#                   "genomicRangesFromBed",
#                   "numberOfBootstrapReplicates",
#                   "normalizationMethod",
#                   "robust",
#                   "backgroundSignificanceLevel",
#                   "outputDir",
#                   "CNVPanelizerFromReadCounts"),
#                 envir = environment())
#
#   myCNVPanelizerResults <- parLapply(clust, 1:length(splits), function(i) {
#     CNVPanelizerFromReadCounts(sampleReadCounts[, splits[[i]]],
#                                referenceReadCounts,
#                                genomicRangesFromBed,
#                                numberOfBootstrapReplicates = numberOfBootstrapReplicates,
#                                normalizationMethod = normalizationMethod,
#                                robust = robust,
#                                backgroundSignificanceLevel = backgroundSignificanceLevel,
#                                outputDir = outputDir)
#   })
#   stopCluster(clust)
#
#   n <- length(myCNVPanelizerResults)
#   res <- NULL
#   for (i in seq(n)) {
#     res <- cbind(res, myCNVPanelizerResults[[i]])
#   }
#
#   myCNVPanelizerResultsCompiled <- myCNVPanelizerResults[[1]]
#
#   if(length(myCNVPanelizerResults) > 1) {
#     for (i in 2:length(myCNVPanelizerResults)) {
#       myCNVPanelizerResultsCompiled$sampleReadCounts <- cbind(myCNVPanelizerResultsCompiled$sampleReadCounts, myCNVPanelizerResults[[i]]$sampleReadCounts)
#       myCNVPanelizerResultsCompiled$bootList <- append(myCNVPanelizerResultsCompiled$bootList, myCNVPanelizerResults[[i]]$bootList)
#       myCNVPanelizerResultsCompiled$backgroundNoise <- append(myCNVPanelizerResultsCompiled$backgroundNoise, myCNVPanelizerResults[[i]]$backgroundNoise)
#       myCNVPanelizerResultsCompiled$reportTables <- append(myCNVPanelizerResultsCompiled$reportTables, myCNVPanelizerResults[[i]]$reportTables)
#     }
#   }
#   return(myCNVPanelizerResultsCompiled)
# }

CNVPanelizerFromReadCounts <- function(sampleReadCounts,
                                       referenceReadCounts,
                                       genomicRangesFromBed,
                                       numberOfBootstrapReplicates = 10000,
                                       normalizationMethod = "tmm",
                                       robust = TRUE,
                                       backgroundSignificanceLevel = 0.05,
                                       outputDir = file.path(getwd(), "CNVPanelizer")) {

  #  VerifiyIfOutputDirectoryExistsOrIsNotEmpty(outputDir)


  metadataFromGenomicRanges <- elementMetadata(genomicRangesFromBed)
  geneNames = metadataFromGenomicRanges["geneNames"][, 1]

  #
  # #   metadataFromGenomicRanges <- elementMetadata(genomicRangesFromBed)
  # #   geneNames = metadataFromGenomicRanges["geneNames"][, 1]
  # #   ampliconNames = metadataFromGenomicRanges["ampliconNames"][, 1]
  #
  #   sampleReadCounts <- ReadCountsFromBam(bamFilenames = sampleBamFilepaths,
  #                                         sampleNames = sampleBamFilepaths,
  #                                         gr = genomicRangesFromBed,
  #                                         removeDup = FALSE)
  #
  #   sampleReadCounts <- ReadCountsFromBam(bamFilenames = sampleBamFilepaths,
  #                                         sampleNames = sampleBamFilepaths,
  #                                         gr = genomicRangesFromBed,
  #                                         removeDup = FALSE)

  normalizedReadCounts <- CombinedNormalizedCounts(sampleReadCounts,
                                                   referenceReadCounts,
                                                   method = normalizationMethod
                                                   #                                                   ,                                                   ampliconNames = ampliconNames
  )

  # After normalization data sets need to be splitted again to perform bootstrap
  samplesNormalizedReadCounts = normalizedReadCounts["samples"][[1]]
  referenceNormalizedReadCounts = normalizedReadCounts["reference"][[1]]

  bootList <- BootList(geneNames,
                       samplesNormalizedReadCounts,
                       referenceNormalizedReadCounts,
                       replicates = numberOfBootstrapReplicates)

  # Estimate the background noise left after normalization
  backgroundNoise <- Background(geneNames,
                                samplesNormalizedReadCounts,
                                referenceNormalizedReadCounts,
                                bootList,
                                replicates = numberOfBootstrapReplicates,
                                significanceLevel = backgroundSignificanceLevel,
                                robust = robust)

  # Build report tables
  reportTables <- ReportTables(geneNames,
                               samplesNormalizedReadCounts,
                               referenceNormalizedReadCounts,
                               bootList,
                               backgroundNoise)

  plots = PlotBootstrapDistributions(bootList,
                                     reportTables,
                                     sampleNames = names(reportTables),
                                     outputFolder = file.path(outputDir, "plots"),
                                     save = TRUE)

  WriteListToXLSX(reportTables,
                  multipleFiles = TRUE,
                  outputFolder = file.path(outputDir, "xlsx"))

  # WriteListToXLSX(reportTables, outputFolder =  "", filepath = "list.xlsx") {
  #   write.xlsx(listOfDataFrames, , rowNames = TRUE)
  # }
  #
  #
  #
  #
  # WriteListToXLSX([email protected], filepath = file.path(outputDir, "reportTables.xlsx"))
  # WriteListToXLSX([email protected], filepath = file.path(getwd(), "CNVPanelizer", "reportTables.xlsx"))

  # CNVPanelizerResults <- setClass("CNVPanelizerResults",
  #                                 #                                  setClass("CNVPanelizerResults",
  #
  #                                 # use this function to extract the slot names of the slotNames(results_CNVPanelizer[[1]])
  #                                 slots = c(sampleReadCounts="matrix",
  #                                           referenceReadColunts="matrix",
  #                                           genomicRangesFromBed="GRanges",
  #                                           bootList="list",
  #                                           backgroundNoise="list",
  #                                           plots = "list",
  #                                           reportTables="list"))
  #
  # myCNVPanelizerResults <- CNVPanelizerResults(sampleReadCounts = sampleReadCounts,
  #                                              referenceReadColunts = referenceReadCounts,
  #                                              genomicRangesFromBed = genomicRangesFromBed,
  #                                              bootList = bootList,
  #                                              backgroundNoise = backgroundNoise,
  #                                              plots = plots,
  #                                              reportTables = reportTables)

  myCNVPanelizerResults <- list(sampleReadCounts = sampleReadCounts,
                               referenceReadCounts = referenceReadCounts,
                               genomicRangesFromBed = genomicRangesFromBed,
                               bootList = bootList,
                               backgroundNoise = backgroundNoise,
                               plots = plots,
                               reportTables = reportTables)



  return(myCNVPanelizerResults)
}

CNVPanelizer <- function(sampleBamFilepaths,
                         referenceBamFilepaths,
                         bedFilepath,
                         amplColumnNumber = 6,
                         minimumMappingQuality = 20,
                         numberOfBootstrapReplicates = 10000,
                         removePcrDuplicates = TRUE,
                         #                         analysisMode = "gene",   # analysisMode can be "gene" or "amplicon"
                         robust = TRUE,
                         backgroundSignificanceLevel = 0.05,
                         outputDir = file.path(getwd(), "CNVPanelizer")) {

  #  VerifiyIfOutputDirectoryExistsOrIsNotEmpty(outputDir)

  genomicRangesFromBed <- BedToGenomicRanges(bedFilepath,
                                             ampliconColumn = amplColumnNumber,
                                             split = "_")

  #  metadataFromGenomicRanges <- elementMetadata(genomicRangesFromBed)
  #  geneNames = metadataFromGenomicRanges["geneNames"][, 1]
  #  ampliconNames = metadataFromGenomicRanges["ampliconNames"][, 1]

  sampleReadCounts <- ReadCountsFromBam(bamFilenames = sampleBamFilepaths,
                                        sampleNames = sampleBamFilepaths,
                                        gr = genomicRangesFromBed,
                                        minimumMappingQuality = minimumMappingQuality,
                                        removeDup = removePcrDuplicates)

  referenceReadCounts <- ReadCountsFromBam(bamFilenames = referenceBamFilepaths,
                                           sampleNames = referenceBamFilepaths,
                                           gr = genomicRangesFromBed,
                                           minimumMappingQuality = minimumMappingQuality,
                                           removeDup = removePcrDuplicates)

  results <- CNVPanelizerFromReadCounts(sampleReadCounts = sampleReadCounts,
                                        referenceReadCounts = referenceReadCounts,
                                        genomicRangesFromBed = genomicRangesFromBed,
                                        #                                        bedFilepath = bedFilepath,
                                        #                                        amplColumnNumber = amplColumnNumber,
                                        #                                       minimumMappingQuality = minimumMappingQuality,
                                        numberOfBootstrapReplicates = numberOfBootstrapReplicates,
                                        #                                       removePcrDuplicates = removePcrDuplicates,
                                        #                                       analysisMode = analysisMode,
                                        robust = robust,
                                        backgroundSignificanceLevel = backgroundSignificanceLevel,
                                        outputDir = outputDir)
  return(results)
}


Strict <- function(cnvPanelizerResults,
                   amplificationMinimumThreshold = 2,
                   deletionMaximumThreshold = 0.6,
                   minimumNumberOfAmplicons = 2
                   #                   , fullConsensusAmongAmplicons = TRUE
) {
#  myCNVPanelizerTableResults <- CollectColumnFromAllReportTables([email protected], "ReliableStatus")
  myCNVPanelizerTableResults <- CollectColumnFromAllReportTables(cnvPanelizerResults$reportTables, "ReliableStatus")
  myCNVPanelizerTableResults <- t(myCNVPanelizerTableResults)
  myCNVPanelizerTableResults <- revalueDF(myCNVPanelizerTableResults, c("Normal"=""))

  interestingColumn <- "MeanBoot"
#  meanBootResults <- CollectColumnFromAllReportTables([email protected], interestingColumn)
  meanBootResults <- CollectColumnFromAllReportTables(cnvPanelizerResults$reportTables, interestingColumn)
  class(meanBootResults) <- "numeric"
  meanBootResults <- t(meanBootResults)

#  consensusResults <- ConsensusCheck([email protected], [email protected], [email protected])
  consensusResults <- ConsensusCheck(cnvPanelizerResults$genomicRangesFromBed, cnvPanelizerResults$sampleReadCounts, cnvPanelizerResults$referenceReadCounts)
  consensusResults <- t(consensusResults)

  test_that("colnames from CNVPanelizer results and Consensus match", {
    expect_equal(colnames(consensusResults), colnames(myCNVPanelizerTableResults))
  })

  test_that("rownames from CNVPanelizer results and Consensus match", {
    expect_equal(rownames(consensusResults), rownames(myCNVPanelizerTableResults))
  })

  cNVPanelizerResultsConsensus <- myCNVPanelizerTableResults
  for (rowname in rownames(myCNVPanelizerTableResults)) {
    for (colname in colnames(myCNVPanelizerTableResults)) {
      #        print(paste(rowname, colname))
      if (!consensusResults[rowname, colname]) {
        cNVPanelizerResultsConsensus[rowname, colname] <- ""
        print(paste("Setting ", myCNVPanelizerTableResults[rowname, colname], "to", cNVPanelizerResultsConsensus[rowname, colname]))
      }
    }
  }

  #  table(unlist(myCNVPanelizerTableResults))
  #  table(unlist(cNVPanelizerResultsConsensus))

  test_that("colnames from CNVPanelizer results and meanBootResults match", {
    expect_equal(colnames(meanBootResults), colnames(cNVPanelizerResultsConsensus))
  })

  test_that("rownames from CNVPanelizer results and meanBootResults match", {
    expect_equal(rownames(meanBootResults), rownames(cNVPanelizerResultsConsensus))
  })

  cNVPanelizerResultsConsensusAndThreshold <- cNVPanelizerResultsConsensus
  for (rowname in rownames(cNVPanelizerResultsConsensusAndThreshold)) {
    for (colname in colnames(cNVPanelizerResultsConsensusAndThreshold)) {
      #      print(cNVPanelizerResultsConsensus[rowname, colname])}}
      if ((cNVPanelizerResultsConsensusAndThreshold[rowname, colname] == "Amplification")
          & ((meanBootResults[rowname, colname] <= amplificationMinimumThreshold))
          #          & ((meanBootResults[rowname, colname] <= 3))
      ) {
        cNVPanelizerResultsConsensusAndThreshold[rowname, colname] <- ""
        print(paste("Setting ", cNVPanelizerResultsConsensus[rowname, colname], "to", cNVPanelizerResultsConsensusAndThreshold[rowname, colname], "because", meanBootResults[rowname, colname]))
      }

      if ((cNVPanelizerResultsConsensusAndThreshold[rowname, colname] == "Deletion")
          & ((meanBootResults[rowname, colname] >= deletionMaximumThreshold))
          #& ((meanBootResults[rowname, colname] >= 0.7))
      ) {
        cNVPanelizerResultsConsensusAndThreshold[rowname, colname] <- ""
        print(paste("Setting ", cNVPanelizerResultsConsensus[rowname, colname], "to", cNVPanelizerResultsConsensusAndThreshold[rowname, colname]))
      }
    }
  }

  #  table(unlist(cNVPanelizerResultsConsensus))
  #  table(unlist(cNVPanelizerResultsConsensusAndThreshold))

  cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons <- cNVPanelizerResultsConsensusAndThreshold
#  genesWithMinimumNumberOfAmplicons <- HaveMininumNumberOfAmplicons([email protected], minimumNumberOfAmplicons)
  genesWithMinimumNumberOfAmplicons <- HaveMininumNumberOfAmplicons(cnvPanelizerResults$genomicRangesFromBed, minimumNumberOfAmplicons)
  for (rowname in rownames(cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons)) {
    for (colname in names(genesWithMinimumNumberOfAmplicons)) {
      if ((cNVPanelizerResultsConsensusAndThreshold[rowname, colname] == "Deletion") &
          !genesWithMinimumNumberOfAmplicons[colname]) {
        cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons[rowname, colname] <- ""
        print(paste("Setting ", cNVPanelizerResultsConsensusAndThreshold[rowname, colname], "to", cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons[rowname, colname]))
      }
    }
  }

  #  table(unlist(cNVPanelizerResultsConsensusAndThreshold))
  #  table(unlist(cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons))

  #  write.xlsx(cNVPanelizerResults, file=file.path(outputDir, paste0("cNVPanelizerResults", ".xlsx")), row.names = TRUE)
  #  write.xlsx(cNVPanelizerResultsConsensus, file=file.path(outputDir, paste0("cNVPanelizerResultsConsensus", ".xlsx")), row.names = TRUE)
  #  write.xlsx(cNVPanelizerResultsConsensusAndThreshold, file=file.path(outputDir, paste0("cNVPanelizerResultsConsensusAndThreshold", ".xlsx")), row.names = TRUE)
  #  write.xlsx(cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons, file=file.path(outputDir, paste0("cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons", ".xlsx")), row.names = TRUE)

  return(cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons)
}


# TODO Could be provided as a measurement of quality for the run
# Overview <- function(
#   #  sampleIdentifier,
#   sampleReadCounts1,
#   referenceReadCounts1,
#   allSamplesReadCounts,
#   genomicRangesFromBed,
#   #  normalizationMethod = "tss",
#   normalizationMethod = "tmm",
#   outputDir = file.path(getwd(), "CNVPanelizer", "overviewKNNReference30")) {
#
#   #  sampleIdentifier <- "I:/Run Data/Run S5-56/M1_S5-56_IonXpress_074_rawlib.bam"
#
#   #   Overview(allSamplesReadCounts[, filepaths],
#   #            allSamplesReadCounts[, referenceReadCounts],
#   #            #         allSamplesReadCounts,
#   #            genomicRangesFromBed,
#   #            #                     normalizationMethod = "tss",
#   #            #                     normalizationMethod = "tmm",
#   #            outputDir = file.path(getwd(), "CNVPanelizer", "overview"))
#
#   sampleReadCounts1 <- allSamplesReadCounts[, filepaths]
#   #  referenceReadCounts1 <- allSamplesReadCounts[, referenceReadCounts]
#   referenceReadCounts1 <- allSamplesReadCounts[, referenceFilepathsKmeans]
#
#   allSamplesReadCountsNormalized <- NormalizeCounts(allSamplesReadCounts, method = normalizationMethod)
#   samplesNormalizedReadCounts = allSamplesReadCountsNormalized[, colnames(sampleReadCounts1)]
#   referenceNormalizedReadCounts = allSamplesReadCountsNormalized[, colnames(referenceReadCounts1)]
#
#   runSamples <- samplesNormalizedReadCounts
#   refSamples <- referenceNormalizedReadCounts
#   entireSetOfSamples <- allSamplesReadCountsNormalized
#
#   geneNames <- unique(genomicRangesFromBed$geneNames)
#   ampliconNames <- genomicRangesFromBed$ampliconNames
#   # TODO validate if ampliconNames are the rownames of samples matrices
#   # ampliconNames <- rownames(allSamplesNormalized)
#
#   sampleIdentifiers <- colnames(runSamples)
#   #sampleIdentifiers <- sampleIdentifiers[1]
#   for (sampleIdentifier in sampleIdentifiers) {
#     myDir <- file.path(outputDir, basename(sampleIdentifier))
#     dir.create(myDir, recursive = TRUE)
#     for (geneName in geneNames) {
#       #    geneName <- "BRAF"
#       myAmpliconsIndexes <- grep(geneName, ampliconNames)
#       png(file=file.path(myDir, paste0(geneName,'_amplicons.png')),
#           width=1000,
#           height=(ceiling(length(myAmpliconsIndexes)/2) * 500))
#
#       #    par(mfrow=c(ceiling(length(myAmpliconsIndexes)/2), 2), mar=c(3,3,5,1))
#       par(mfrow=c(ceiling(length(myAmpliconsIndexes)/2), 2), mar=c(3, 3, 5, 1), oma=c(0,0,7,0))
#
#       for (ampliconName in ampliconNames[myAmpliconsIndexes]) {
#         #  ampliconName <- "chr7_140453102_140453221_BRAF_Ex15-NM_004333"
#         tmp <- data.frame(y = c(as.vector(runSamples[ampliconName, ])
#                                 ,as.vector(refSamples[ampliconName, ])
#                                 ,as.vector(entireSetOfSamples[ampliconName, ])
#         ),
#
#         x = factor(
#           c(rep("runSamples", length(as.vector(runSamples[ampliconName, ])))
#             ,rep("refSamples",length(as.vector(refSamples[ampliconName, ])))
#             ,rep("allSamples",length(as.vector(entireSetOfSamples[ampliconName, ])))
#           ), levels = c("runSamples", "refSamples", "allSamples"))
#
#         #                         x = c(rep("runSamples", length(as.vector(runSamples[ampliconName, ])))
#         #                               ,rep("refSamples",length(as.vector(refSamples[ampliconName, ])))
#         #                               )
#         )
#
#         myValues <- runSamples[ampliconName, sampleIdentifier]
#
#         boxplot(y~x,
#                 data=tmp,
#                 ylab="Read Counts",
#                 xlab="Sample set",
#                 names = c("Run Samples", "Reference", "All Samples"),
#                 #              main = ampliconName)
#                 main = paste(myValues, ampliconName))
#         stripchart(y ~ x,
#                    vertical = TRUE,
#                    data = tmp,
#                    method = "jitter",
#                    add = TRUE,
#                    #                pch = 20,
#                    pch = 20,
#                    col = 'blue')
#
#
#         #    stripchart(list(0.5, 1), vertical=TRUE, add=TRUE, method="stack", col='red', pch="*")
#         stripchart(myValues, vertical=TRUE, add=TRUE, method="stack", col='red', pch=16, cex=2)
#       }
#       title(main = paste(geneName, "-", sampleIdentifier), font.main = 4, outer=TRUE)
#       dev.off()
#     }
#   }
# }

## TODO CHECK THIS
#
#
# #rowIndexesToMerge <- IndexGenesPositions(lungGenomicRanges$geneNames)
# #GeneMultilinePlot(normalSamples, rowIndexesToMerge, normalizationMethod = "tss")
# GeneMultilinePlot <- function(dataSamples, rowIndexesToMerge, normalizationMethod =  "tmm") {
#   dataSamples <- round(NormalizeCounts(dataSamples))
#   result <- NULL
#   for (i in 1:ncol(dataSamples)) {
#     result <- cbind(result, sapply( rowIndexesToMerge, function(x) {mean(dataSamples[x, i])} ))
#   }
#   colnames(result) <- colnames(dataSamples)
#   dataSamples <- result
#   #  dataSamples <- cbind(geneReadCounts = seq_along(rownames(dataSamples)), dataSamples)
#
#   melted <- melt(dataSamples)
#
#   ggplot(data=melted, aes(x=Var1, y=value, group=Var2, colour=Var2, linetype = Var2)) +
#     #geom_line() +
#     #    scale_colour_manual(values = c('pink','orange','white', 'red')) +
#     theme(legend.position="left") +
#     geom_point(aes(color=Var2)) +
#     geom_line(size=1) +
#     theme(axis.text.x = element_text(angle = 90, hjust = 1))
# }
#

# #Example for testing MultiLinePlot
# dataSamples <- read.table(text = "CN CNVPanelizer panelcn.mops ioncopy
#                  CN1 0.1 0.4 0.4
#                  CN2 0.2 0.1 0.3
#                  CN3 0.1 0.2 0.1
#                  CN4 0.1 0.4 0.4
#                  CN5 0.2 0.1 0.3
#                  CN6 0.1 0.3 0.5
#                  CN7 0.6 0.6 0.6
#                  CN8 0.6 0.3 0.2
#                  CN9 0.6 0.8 0.8
#                  CN10 1 0.9 0.8", header=TRUE)

# TODO reuse this code at GeneMultilinePlot ...
# MultiLinePlot <- function(dataSamples, idColumnIdentifier = NULL, title = "", legendPosition = "left", xLabel = "", yLabel = "", smooth = FALSE) {
#   rs <- dataSamples
#   if (is.null(idColumnIdentifier)) {
#     melted = melt(rs)
#   } else {
#     melted = melt(rs, id.vars=idColumnIdentifier)
#   }
#   melted$CN <- as.character(melted$CN)
#   melted$CN <-factor(melted$CN, levels = unique(melted$CN))
#   myPlot <- ggplot(data=melted, aes(x=CN, y=value, group=variable, colour=variable, linetype = variable)) +
#     scale_colour_discrete(name = "Method") + #  scale_colour_manual(values = c('pink','orange','white', 'red')) +
#     ggtitle(title) +
#     geom_point(aes(color=variable)) + # plots the dots..
#     theme(text = element_text(size=10),
#           axis.text.x = element_text(angle=90, hjust=1)) +
#     theme(panel.border = element_blank(),
#           panel.grid.major = element_blank(),
#           panel.grid.minor = element_blank(),
#           axis.line = element_line(colour = "black")) +
#     theme_bw() + # sets the background colour
#     theme(plot.title = element_text(hjust = 0.5)) +
#     theme(legend.position=legendPosition) + xlab(xLabel) + ylab(yLabel)
#   if(smooth) {
#     #      myPlot = myPlot + geom_smooth(method = "loess", se = FALSE, span = 1.5, show.legend = FALSE)
#     myPlot = myPlot + geom_smooth(method = "auto", se = FALSE, span = 1.5, show.legend = FALSE)
#   } else {
#     myPlot = myPlot + geom_line(size = 1, show.legend = FALSE)    # links the dots with lines..
#   }
#   return(myPlot)
# }


SampleReadCountsPools <- function(ampliconsReadCounts, numberOfPools) {
  listOfPools <- list()
  for (i in 1:numberOfPools) {
    listOfPools[[i]] <- ampliconsReadCounts[seq(from = i, to = length(ampliconsReadCounts), by = numberOfPools)]
  }
  return(listOfPools)
}


SampleReadCountsByPool <-function(ampliconReadCounts, pools) {
  # existingPools <- as.numeric(names(table(pools)))
  # if (any(is.na(existingPools)) {
  #   stop("Pools are not defined as 1, 2, ...")
  # }
  existingPools <- names(table(pools))
  if (length(pools) != length(ampliconReadCounts)) {
    stop("Length  of pools is different than ampliconReadCounts")
  }
  listOfPools <- list()
  for (i in existingPools) {
    listOfPools[[i]] <- (ampliconReadCounts[which(pools==i)])
  }
  return(listOfPools)
}


PoolsPlots <- function(sampleReadCountsByPool) {
  plotValues <- as.integer(unlist(lapply(sampleReadCountsByPool, mean)))
  barplot(plotValues, ylab = "Read Counts mean", xlab="Pools", names.arg = names(sampleReadCountsByPool), col = "dodgerblue4",
          #          ylim=c(0, trunc(max(plotValues)))
  )
}








################################################################################
# KaryotypeAberrationPlot
################################################################################
# KaryotypeAberrationPlot(bedFilepath = bedFilepath,
#                         ampliconColumn = 4,
#                         filepath = "TESTANDO_APAGAR_KARYOTYPE.png")

KaryotypeAberrationPlot <- function(bedFilepath,
                                    ampliconColumn,
                                    status = sample(c("red", "green", "blue"), 30, replace = TRUE),
                                    filepath = "KaryotypeAberrationPlot.png",
                                    width = 1200,
                                    height = 800) {
  #source("https://bioconductor.org/biocLite.R")
  #biocLite("chromPlot")
  #library(chromPlot)
  #library(stringr)
  hg_gap <- NULL # not sure if this is good solution to avoid 'NOTE'
  data(hg_gap)
  getGeneRegionsStatusSummary <- GetGeneRegionsStatusSummary(bedFilepath = bedFilepath,
                                                             ampliconColumn = ampliconColumn,
                                                             status = sample(c("red", "green", "blue"), 30, replace=TRUE))
  geneRegionsBands <- GetGeneRegionsBand(getGeneRegionsStatusSummary)
  png(filepath, width = width, height = height)
  #  chromPlot(gaps=hg_gap)
  #  chromPlot(gaps=hg_gap, stat=geneRegions, statCol="Value")
  #  chromPlot(gaps=hg_gap, stat=geneRegions, statCol="Value", chr=unique(geneRegions$Chrom))
  #  chromPlot(gaps=hg_gap, bands=hg_cytoBandIdeo, stat=geneRegions, statCol="Value", chr=unique(geneRegions$Chrom), statName="Value", noHist=TRUE, figCols=5, cex=0.7, statTyp="n", chrSide=c(1,1,1,1,1,1,-1,1))

  # TODO TEMPORARARIO PORQUE O chromPlot nao pode ser instalado em versao 3.2.0 ...
  # karyotypeAberrationPlot <- chromPlot(gaps = hg_gap,
  #                                      bands = geneRegionsBands,
  #                                      chr = unique(getGeneRegionsStatusSummary$Chrom),
  #                                      stat = getGeneRegionsStatusSummary,
  #                                      statCol = "Value",
  #                                      statName = "Value",
  #                                      noHist = TRUE,
  #                                      figCols = 7,
  #                                      cex = 0.7,
  #                                      statTyp = "n",
  #                                      chrSide = c(1,1,1,1,1,1,-1,1))
  karyotypeAberrationPlot <- NA
  dev.off()
  return(karyotypeAberrationPlot)
}


# It Merges all regions related to the same gene and assigns it a color that relates to the aberratiosn status
# Status should be a vector of colors representing the amplifications (red for amplification, green for deletion and blue for normal)
GetGeneRegionsStatusSummary <- function(bedFilepath, ampliconColumn = 4, split = "_", status = c("blue")) {
  a <- read.csv(bedFilepath, sep="\t", skip=1, header = FALSE)
  a[, 1] <- str_replace(a$V1, "chr","")
  a[, ampliconColumn] <- vapply(strsplit(as.vector(a[, ampliconColumn]), "_", fixed = TRUE), "[", "", 1)
  a <- a[,1:4]
  colnames(a) <- c("Chrom", "Start", "End", "ID"
                   #                   , "Colors"
  )
  geneNames <- unique(a$ID)
  geneRegions <- NULL
  for (geneName in geneNames) {
    genomicRegions <- a[a$ID==geneName, ]
    chromossome <- unique(genomicRegions$Chrom)
    start <- min(genomicRegions$Start)
    end <- max(genomicRegions$End)
    gene <- unique(genomicRegions$ID)
    geneRegions <- rbind(geneRegions, c(chromossome, start, end, gene))
  }
  colnames(geneRegions) <- colnames(a)
  geneRegions <- as.data.frame(geneRegions,
                               #                               as.is = TRUE)
                               stringsAsFactors = FALSE)
  geneRegions <- cbind(geneRegions, Colors = status)

  #  geneRegions$Chrom <- as.numeric(geneRegions$Chrom)
  geneRegions$Start <- as.numeric(geneRegions$Start)
  geneRegions$End <- as.numeric(geneRegions$End)
  geneRegions$ID <- as.factor(geneRegions$ID)
  return(geneRegions)
}


# Returns objected needed for plotting.. it changes some column names and makes the regions larger to be able to be visible in the plot..
#GetGeneRegionsBand <- function(geneRegionsStatusSummary) {
GetGeneRegionsBand <- function(geneRegions) {
  geneRegionsBands <- geneRegions
  colnames(geneRegionsBands) <- c("Chrom", "Start", "End", "Name", "Colors")
  geneRegionsBands$Chrom <- paste0("chr", geneRegionsBands$Chrom)
  geneRegionsBands$Name <- as.character(geneRegionsBands$Name)
  geneRegionsBands$Colors <- as.character(geneRegionsBands$Colors)
  # since the regions are too small to be visualized.. around 1 milion times smaller than the size of chromossome..
  geneRegionsBands$End <- geneRegionsBands$End + 1E6
  return(geneRegionsBands)
}






# TODO
# Could take in consideration the size of regions and
# the whole coverage of the gene (number of amplicons * seqLength)
# > 50 bp
# panelStatistics(genomicRangesFromBed) {
#   # hist(width(ranges(genomicRangesFromBed)))
#   # genomicRangesFromBed$geneNames
#
# }



#   boxplotGeneLevel <- function(allSamplesNormalized,
#                                cohortFilepaths,
#                                referenceFilepathsKNN,
#                                referenceFilepathsPercentil,
#                                geneName,
#                                outputDir = file.path(getwd(),"batchAnalysisNoMininumMappingQuality2")) {
#     #boxplotPerAmplicon <- function(ampliconIndex) {
#     #geneName <- "BRAF"
#     #geneName <- "APC"
#     #  geneName <- "ERBB4"
#
# #    setwd(outputDir)
#
#     #  cohortFilepaths <- filepaths
#     martina <- allSamplesNormalized[, (colnames(allSamplesNormalized) %in% cohortFilepaths)]
#
#     nonMartina <- allSamplesNormalized[, !(colnames(allSamplesNormalized) %in% cohortFilepaths)]
#
#
#     for (geneName in geneNames) {
#
#
#
#       ampliconNames <- rownames(allSamplesNormalized)
#       myAmpliconsIndexes <- grep(geneName, ampliconNames)
#
#       #  pdf(file=paste0(geneName,'_plot.pdf'), width=10, height= ceiling(length(myAmpliconsIndexes)/2) * 5)
#       png(file=paste0(geneName,'_plot.png'), width=10, height= ceiling(length(myAmpliconsIndexes)/2) * 5)
#
#       par(mfrow=c(ceiling(length(myAmpliconsIndexes)/2), 2), mar=c(3,3,5,1))
#
#       for (ampliconName in ampliconNames[myAmpliconsIndexes]) {
#         #  ampliconName <- "chr7_140453102_140453221_BRAF_Ex15-NM_004333"
#         tmp <- data.frame(y = c(as.vector(nonMartina[ampliconName, ]),
#                                 as.vector(nonMartina[ampliconName, referenceFilepathsKNN]),
#                                 as.vector(nonMartina[ampliconName, referenceFilepathsPercentil]),
#                                 as.vector(martina[ampliconName, ])),
#                           x = c(rep("nonMartina",length(as.vector(nonMartina[ampliconName, ]))),
#                                 rep("referenceKNN", length(as.vector(nonMartina[ampliconName, referenceFilepathsKNN]))),
#                                 rep("referencePercentil", length(as.vector(nonMartina[ampliconName, referenceFilepathsPercentil]))),
#                                 rep("martina", length(as.vector(martina[ampliconName, ])))))
#
#         boxplot(y~x, data=tmp, ylab="Read Counts", xlab="Sample set", main = ampliconName)
#         stripchart(y ~ x,
#                    vertical = TRUE,
#                    data = tmp,
#                    method = "jitter",
#                    add = TRUE,
#                    pch = 20,
#                    col = 'blue')
#       }
#       dev.off()
#     }
#   }





# i <- 0
# for (x in 1:100) {
#   i <- i+1
#   if (i %% 10 == 0) {
#     Sys.sleep(1)
#     cat(".")
#   }
# }
#
budczies/CNVPanelizer documentation built on Nov. 20, 2018, 12:24 a.m.