R/core.R


#' Helper function to generate a data frame that can be used as input for the function \emph{analyzeSNPhood}
#'
#' \code{collectFiles} creates a data frame that can be used as input for the function \code{\link{analyzeSNPhood}}.
#' The resulting data frame contains information about files that will be processed (column signal) and, optionally,
#' corresponding input files for normalization (column input) and labels to combine datasets to meta-datasets (column individual).
#'
#' Note that if you already have an object of class \code{\linkS4class{BamFile}} or \code{\linkS4class{BamFileList}}, this can easily be
#' integrated into the \code{SNPhood} framework by using the \code{\link[Rsamtools]{path}} function to specify the
#' value of the parameter \code{patternFiles}, see the examples below.
#'
#' @param patternFiles Character. If vector of length 1, absolute path to one or multiple BAM files that should be processed. Wildcards ("*") are allowed
#' (examples are *CTCF* or *.bam, see also examples). If vector of length > 1, each element must specify the absolute path to a BAM file,
#' with no wildcards being allowed. See also the note above concerning the integration of
#' \code{\linkS4class{BamFile}} or \code{\linkS4class{BamFileList}} objects.
#' For more details, see the examples and the vignette.
#' @param recursive Logical(1). Default FALSE. Should the search for BAM files within the directory be performed recursively?
#' If set to TRUE, all files matching the pattern within the specified directory and all of its subdirectories will be added.
#' If set to FALSE, only files within the specified directory but not any subdirectories will be used.
#' @param ignoreCase Logical(1). Default TRUE. Should the case of the pattern be ignored when matching file names?
#' @param inputFiles Character. Default NA. Input files that should be used as a control for normalization.
#' Supported values are NA (no input normalization), a single character specifying one or multiple input files (comma-separated, see examples)
#' that should be used for all processed files, or a character vector of the same length as the number of files that will be processed.
#' Input files can also be added or edited manually in the resulting data frame later (see vignette).
#' @param individualID Character. Default NA. Individual IDs to assign to the datasets. Only relevant if datasets should be pooled.
#' @param genotypeMapping Character. Default NA. Path to the corresponding genotype file in VCF format, followed by a colon and the name of the column in the VCF file.
#' Genotypes can also be integrated later using the function \code{\link{associateGenotypes}}.
#' @template verbose_TRUE
#'
#' @return a data frame with the four columns \code{signal}, \code{input}, \code{individual} and \code{genotype} that can be used as input for the function \code{\link{analyzeSNPhood}}.
#' @examples
#' ## For brevity, only exemplary filenames are given in the following.
#' ## Note that in reality, absolute paths should be provided.
#' ## First some examples using specific files rather than files that
#' ## match a pattern in a particular directory
#'
#' ## Load SNPhoodData library
#' library(SNPhoodData)
#' files.df = collectFiles(patternFiles = paste0(system.file("extdata", package = "SNPhoodData"),"/*.bam"))
#'
#' ## If you already have BAM files in objects of class \code{\linkS4class{BamFile}} or \code{\linkS4class{BamFileList}},
#' ## you may use the following code snippet:
#' library(Rsamtools)
#' files = list.files(pattern = "*.bam$",system.file("extdata", package = "SNPhoodData"),full.names = TRUE)
#' BamFile.o = BamFile(files[1])
#' BamFiles.o = BamFileList(files)
#' files.df = collectFiles(patternFiles = path(BamFile.o))
#' files.df = collectFiles(patternFiles = path(BamFiles.o))
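#'
#' ## A hypothetical call that also assigns input files for normalization:
#' ## one or multiple comma-separated input files can be given per signal file.
#' ## The input file names below are placeholders and not part of SNPhoodData,
#' ## which is why the call is not executed here.
#' # files.df = collectFiles(patternFiles = paste0(system.file("extdata", package = "SNPhoodData"), "/*.bam"),
#' #                         inputFiles = "input_rep1.bam,input_rep2.bam")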
#' @seealso \code{\link{analyzeSNPhood}}
#' @export
#' @import checkmate
collectFiles <- function(patternFiles,
                         recursive = FALSE,
                         ignoreCase = TRUE,
                         inputFiles = NA,
                         individualID = NA,
                         genotypeMapping = NA,
                         verbose = TRUE)  {

    # Check types and validity of arguments
    assertCharacter(patternFiles, any.missing = FALSE, min.chars = 1, min.len = 1)
    assertFlag(recursive)
    assertFlag(ignoreCase)
    assert(checkScalarNA(inputFiles),
           checkCharacter(inputFiles,   any.missing = TRUE, all.missing = TRUE, min.chars = 1, min.len = 1))
    assert(checkScalarNA(individualID),
           checkCharacter(individualID, any.missing = TRUE, all.missing = TRUE, min.chars = 1, min.len = 1))
    assert(checkScalarNA(genotypeMapping),
           checkCharacter(genotypeMapping, any.missing = TRUE, all.missing = TRUE, min.chars = 1, min.len = 1))
    assertFlag(verbose)


    files.vec = .collectFilesGen(
        patternFiles = patternFiles,
        recursive    = recursive,
        ignoreCase   = ignoreCase,
        verbose      = verbose
    )

    nFilesToProcess = length(files.vec)

    if (nFilesToProcess == 0) {
        return(data.frame(signal = character(),
                          input = character(),
                          individual = character(),
                          genotype = character(),
                          stringsAsFactors = FALSE))
    }


    if (length(inputFiles) != nFilesToProcess & length(inputFiles) != 1) {
        stop("Length of the inputFiles vector must be either 1 or equal to the number of files to be processed. Did you mean to set it to NA?")
    }

    if (length(individualID) != nFilesToProcess & length(individualID) != 1) {
        stop("Length of the individualID vector must be either 1 or equal to the number of files to be processed. Did you mean to set it to NA?")
    }

    if (length(genotypeMapping) != nFilesToProcess & length(genotypeMapping) != 1) {
        stop("Length of the genotypeMapping vector must be either 1 or equal to the number of files to be processed. Did you mean to set it to NA?")
    }

    data.frame(signal = files.vec,
               input = inputFiles,
               individual = individualID,
               genotype = genotypeMapping,
               stringsAsFactors = FALSE)

}



#' Helper function to generate a default parameter list as input for the function \emph{analyzeSNPhood}
#'
#' \code{getDefaultParameterList} generates a default parameter list that can be used as input for the function \code{\link{analyzeSNPhood}}.
#' The path to the user regions file can optionally be provided as an argument to the function. See the examples for further details.
#' Before running the function \code{\link{analyzeSNPhood}}, carefully check that the default parameters are suitable for the analysis.
#'
#' @param path_userRegions Character(1). Specify the value of the parameter \code{path_userRegions}
#' (absolute path to the user regions file, see the Vignette for details).
#' @param isPairedEndData Logical(1). Default TRUE. Are the data paired-end (TRUE) or single-end (FALSE)?
#'
#' @return a named list with default values for the currently supported parameters that can be used as
#' input for the function \code{\link{analyzeSNPhood}}:
#'
#' \itemize{
#'  \item{readFlag_isPaired: Logical(1), TRUE for paired-end data, NA for single-end}
#'  \item{readFlag_isProperPair: Logical(1), TRUE}
#'  \item{readFlag_isUnmappedQuery: Logical(1), FALSE}
#'  \item{readFlag_hasUnmappedMate: Logical(1), FALSE}
#'  \item{readFlag_isMinusStrand: Logical(1), NA}
#'  \item{readFlag_isMateMinusStrand: Logical(1), NA}
#'  \item{readFlag_isFirstMateRead: Logical(1), NA}
#'  \item{readFlag_isSecondMateRead: Logical(1), NA}
#'  \item{readFlag_isNotPrimaryRead: Logical(1), FALSE}
#'  \item{readFlag_isNotPassingQualityControls: Logical(1), FALSE}
#'  \item{readFlag_isDuplicate: Logical(1), FALSE}
#'  \item{readFlag_reverseComplement: Logical(1), FALSE}
#'  \item{readFlag_simpleCigar: Logical(1), TRUE}
#'  \item{readFlag_minMapQ: Integer(1), 0}
#'  \item{path_userRegions: Character(1), as given by the function argument path_userRegions}
#'  \item{zeroBasedCoordinates: Logical(1), FALSE}
#'  \item{regionSize: Integer(1), 500}
#'  \item{binSize: Integer(1), 50}
#'  \item{readGroupSpecific: Logical(1), TRUE}
#'  \item{strand: Character(1), "both"}
#'  \item{startOpen: Logical(1), FALSE}
#'  \item{endOpen: Logical(1), FALSE}
#'  \item{headerLine: Logical(1), FALSE}
#'  \item{linesToParse: Integer(1), -1}
#'  \item{lastBinTreatment: Character(1), "delete"}
#'  \item{assemblyVersion: Character(1), "hg19"}
#'  \item{effectiveGenomeSizePercentage: Numeric(1), -1 (i.e., determined automatically for supported genome assemblies)}
#'  \item{nCores: Integer(1), 1}
#'  \item{keepAllReadCounts: Logical(1), FALSE}
#'  \item{normByInput: Logical(1), FALSE}
#'  \item{normAmongEachOther: Logical(1), TRUE}
#'  \item{poolDatasets: Logical(1), FALSE}
#'
#' }
#' To avoid redundancy, a detailed description of the parameters can be found at the end of the
#' main vignette of \code{SNPhood} (\code{browseVignettes("SNPhood")}).
#'
#' @examples
#' ## The path to the user regions file can, optionally, be specified directly when calling the function
#' par.l = getDefaultParameterList(path_userRegions = "path/to/regions", isPairedEndData = TRUE)
#' ## If the file is not specified, you need to change it
#' ## before you can execute the function \code{\link{analyzeSNPhood}}
#' par.l = getDefaultParameterList(isPairedEndData = TRUE)
#' par.l$path_userRegions = "path/to/regions"
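#' ## Individual defaults can be adjusted before calling analyzeSNPhood;
#' ## the values below are purely illustrative
#' par.l$binSize    = 25
#' par.l$regionSize = 250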
#'
#' @seealso \code{\link{analyzeSNPhood}}
#' @export
#' @import checkmate
getDefaultParameterList  <-  function(path_userRegions = NULL,
                                      isPairedEndData = TRUE) {

    assert(checkNull(path_userRegions),
           checkCharacter(path_userRegions, any.missing = FALSE, min.chars = 1, len = 1)
    )
    assertFlag(isPairedEndData)

    readFlags.l = .generateDefaultReadFlags(pairedEndReads = isPairedEndData)

    others.l = list(
        readFlag_reverseComplement           = FALSE,
        readFlag_simpleCigar                 = TRUE,
        readFlag_minMapQ                     = 0,
        path_userRegions                     = path_userRegions,
        zeroBasedCoordinates                 = FALSE,
        regionSize                           = 500,
        binSize                              = 50,
        readGroupSpecific                    = TRUE,
        strand                               = "both",
        startOpen                            = FALSE,
        endOpen                              = FALSE,
        headerLine                           = FALSE,
        linesToParse                         = -1,
        lastBinTreatment                     = "delete",
        assemblyVersion                      = "hg19",
        effectiveGenomeSizePercentage        = -1,
        nCores                               = 1,
        keepAllReadCounts                    = FALSE,
        normByInput                          = FALSE,
        normAmongEachOther                   = TRUE,
        poolDatasets                         = FALSE
    )

    c(readFlags.l, others.l)
}

.getUniqueMappabilityData <- function(genomeAssembly,
                                      readLength,
                                      verbose = TRUE) {

    assertSubset(genomeAssembly, c("hg38", "hg19", "hg18", "mm10", "mm9", "dm3"))
    assertInt(readLength, lower = 15)
    assertFlag(verbose)

    # Values taken from the mappability table in Koehler, Ryan, et al. "The uniqueome: a mappability resource for short-tag sequencing." Bioinformatics 27.2 (2011): 272-274.
    # Values are assumed to be identical across assembly versions of the same organism

    readLength.vec = c(25, 30, 35, 50, 60, 75, 90)

    uniqueMappability.df = data.frame( "25" = c(rep(0.66,3), rep(0.699,2), 0.675),
                                       "30" = c(rep(0.709,3), rep(0.744,2), 0.684),
                                       "35" = c(rep(0.741,3), rep(0.771,2), 0.69),
                                       "50" = c(rep(0.769,3), rep(0.791,2), 0.692),
                                       "60" = c(rep(0.775,3), rep(0.794,2), 0.692),
                                       "75" = c(rep(0.793,3), rep(0.807,2), 0.695),
                                       "90" = c(rep(0.808,3), rep(0.817,2), 0.698),
                                       stringsAsFactors = FALSE)

    rownames(uniqueMappability.df) = c("hg38", "hg19", "hg18", "mm10", "mm9", "dm3")

    # Get the read length that is closest to the real one
    bestApprox = readLength.vec[which(abs(readLength.vec - readLength) == min(abs(readLength.vec - readLength)))][1]

    if (verbose) message("  Read length from pre-computed table that most closely resembles observed read length: ", bestApprox)
    if (abs(bestApprox - readLength) > 20) {
        warning("Difference between observed read length and best match in the pre-computed table is bigger than 20bp. You may want to set the value of the parameter effectiveGenomeSizePercentage manually for increased accuracy with respect to the global genome average that is used for input normalization.")
    }

    res = uniqueMappability.df[which(rownames(uniqueMappability.df) == genomeAssembly), which(colnames(uniqueMappability.df) == paste0("X", bestApprox))]

    if (verbose) message("  Value for the effective genome size: ", res)

    res
}
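
# Illustrative (hypothetical) lookup: for genomeAssembly = "hg19" and readLength = 36,
# the closest pre-computed read length is 35 bp, so the hg19 value 0.741 from the table
# above would be returned:
# .getUniqueMappabilityData("hg19", 36, verbose = FALSE)   # -> 0.741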

#' @import checkmate
.calcBinLabelsPlot <- function(nBins) {

    binLabels = seq(-floor(nBins/2), floor(nBins/2), 1)

    # Current design decision:
    # Odd number of bins (e.g, 7)
    # -3 -2 -1 0 1 2 3
    # Even number of bins (e.g, 6)
    # -3 -2 -1   1 2 3
    if (nBins %% 2 == 0) { # Delete the 0
        binLabels = binLabels[-which(binLabels == 0)]
    }

    names(binLabels) = 1:nBins

    binLabels
}
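
# Worked examples for the bin-label scheme described above:
# .calcBinLabelsPlot(7) returns -3 -2 -1 0 1 2 3 (named "1".."7"),
# .calcBinLabelsPlot(6) returns -3 -2 -1 1 2 3 (the 0 label is dropped).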



###################################
###################################
# PARSING AND CHECKING  FUNCTIONS #
###################################
###################################

#' @import checkmate
#' @importFrom BiocParallel multicoreWorkers
.checkConfigFile <- function(par.l,
                             verbose = TRUE) {


    supportedPar.l = getDefaultParameterList(NULL)
    # Check types and validity of arguments
    assertList(par.l, min.len = 1, all.missing = FALSE)

    # Check if all parameters are present
    assertSubset(names(supportedPar.l), names(par.l))

    # Check if there are any additional parameters defined that are NOT supported. If yes, abort to make sure the user didn't unintentionally make a mistake
    if (!testSubset(names(par.l), names(supportedPar.l))) {
        addPar = setdiff(names(par.l), names(supportedPar.l))

        stop("The following parameters are defined in the parameter list but are not supported (set them to NULL to delete them): \n ", paste(addPar, collapse = ","))
    }

    pars.df = .getParameters(.getListOfSupportedParameters())

    parsRequired.vec = pars.df$parName

    parsRequiredTypes.vec = pars.df$parType

    names(parsRequiredTypes.vec) = parsRequired.vec

    # Check if any of the required parameters are missing or if parameters that are not supported are included
    parsNotSupported = as.character(names(par.l)[which(!(names(par.l) %in% pars.df$parName))])
    parsNotFound     = as.character(pars.df$parName[which(!(pars.df$parName %in% names(par.l)))])

    if (length(which(is.na(parsNotSupported))) > 0) {
        parsNotSupported =  parsNotSupported[-which(is.na(parsNotSupported))]
    }

    if (length(which(is.na(parsNotFound))) > 0) {
        parsNotFound     =  parsNotFound[-which(is.na(parsNotFound))]
    }


    if (length(parsNotSupported) > 0) {
        warning(paste("The following parameters are not supported and have been ignored: ", paste(parsNotSupported, collapse = ", "), sep = ""))
    }

    if (length(parsNotFound) > 0) {
        stop(paste("The following mandatory parameters have not been found: ", paste( parsNotFound, collapse = ", "), sep = ""))
    }


    # Check that all parameters are valid
    assert(checkInt(par.l$linesToParse, lower = -1, upper = -1),
           checkInt(par.l$linesToParse, lower = 1))
    assertIntegerish(par.l$binSize, lower = 1, any.missing = FALSE, len = 1)
    assertIntegerish(par.l$regionSize, lower = 1, any.missing = FALSE, len = 1)
    assertIntegerish(par.l$nCores, lower = 1, any.missing = FALSE, len = 1)
    assertCharacter(par.l$path_userRegions, any.missing = FALSE, len = 1)
    assertChoice(par.l$assemblyVersion, c("hg38", "hg19", "hg18", "mm10", "mm9", "dm3", "sacCer3", "sacCer2"))
    assertNumber(par.l$effectiveGenomeSizePercentage, lower = -1, upper = 1)

    assertFlag(par.l$keepAllReadCounts)

    assertFlag(par.l$readFlag_isPaired, na.ok = TRUE)
    assertFlag(par.l$readFlag_isProperPair, na.ok = TRUE)
    assertFlag(par.l$readFlag_isUnmappedQuery, na.ok = TRUE)
    assertFlag(par.l$readFlag_hasUnmappedMate, na.ok = TRUE)
    assertFlag(par.l$readFlag_isMinusStrand, na.ok = TRUE)
    assertFlag(par.l$readFlag_isMateMinusStrand, na.ok = TRUE)
    assertFlag(par.l$readFlag_isFirstMateRead, na.ok = TRUE)
    assertFlag(par.l$readFlag_isSecondMateRead, na.ok = TRUE)
    assertFlag(par.l$readFlag_isNotPrimaryRead, na.ok = TRUE)
    assertFlag(par.l$readFlag_isNotPassingQualityControls, na.ok = TRUE)
    assertFlag(par.l$readFlag_isDuplicate, na.ok = TRUE)
    assertFlag(par.l$readFlag_reverseComplement, na.ok = TRUE)
    assertFlag(par.l$readFlag_simpleCigar, na.ok = TRUE)
    assertInt(par.l$readFlag_minMapQ, lower = 0)

    assertFlag(par.l$zeroBasedCoordinates)
    assertFlag(par.l$readGroupSpecific)
    assertFlag(par.l$startOpen)
    assertFlag(par.l$endOpen)
    assertFlag(par.l$headerLine)
    assertFlag(par.l$normByInput)
    assertFlag(par.l$normAmongEachOther)
    assertFlag(par.l$poolDatasets)
    assertChoice(par.l$strand, c("sense", "antisense", "both"))
    assertChoice(par.l$lastBinTreatment, c("keepUnchanged", "delete", "extend"))

    if (par.l$binSize > (2* par.l$regionSize + 1)) {
        stop("Value of parameter binSize not valid: Must be in the interval [1, 2* regionSize + 1].")
    }

    if (!par.l$readFlag_simpleCigar) {
        stop("The parameter \"readFlag_simpleCigar\" must currently be set to TRUE in order to retrieve genotype information from the reads.")
    }


    if (par.l$nCores > multicoreWorkers()) {
        warning("Requested ", par.l$nCores, " CPUs, but only ", multicoreWorkers(), " are available and can be used.")
        par.l$nCores = multicoreWorkers()
    }


    if (verbose) message("SUCCESSFULLY FINISHED PARSING AND CHECKING THE CONFIGURATION FILE. PARSED PARAMETERS:")

    for (i in seq_len(length(par.l))) {
        if (verbose) message(" Parameter \"", names(par.l)[i],"\": \"", par.l[[i]],"\"")
    }


    par.l

}

# Although BAM is zero-based internally, samtools and Rsamtools use 1-based positions.
# 0-based coordinate system: a coordinate system in which the first base of a sequence is zero.
# In this coordinate system, a region is specified by a half-closed-half-open interval.
# For example, the region between the 3rd and the 7th bases inclusive is [2, 7). The BAM, BCFv2, BED, and PSL formats use the 0-based coordinate system.

#' @import Rsamtools
# @importFrom Rsamtools scanBamWhat
#' @import checkmate
.extractFromBAM  <-  function(ranges.gr,
                              file,
                              fieldsToRetrieve,
                              par.l,
                              verbose = TRUE) {

    # Check types and validity of arguments
    assertClass(ranges.gr, "GRanges")
    assertFileExists(file, access = "r")
    assertSubset(fieldsToRetrieve, scanBamWhat(), empty.ok = FALSE)

    assertList(par.l, min.len = 1, all.missing = FALSE)
    assertFlag(verbose)

    .extractFromBAMGen(regions.gr = ranges.gr,
                       file = file,
                       fieldsToRetrieve = fieldsToRetrieve,
                       flags = .constructScanBamFlags(par.l),
                       readGroupSpecificity = par.l$readGroupSpecific,
                       simpleCigar = par.l$readFlag_simpleCigar,
                       reverseComplement = par.l$readFlag_reverseComplement,
                       minMapQ = par.l$readFlag_minMapQ,
                       verbose = verbose)
}

# @importFrom GenomicRanges start end strand seqnames
#' @import GenomicRanges
#' @import checkmate
.createBins  <-  function(userRegions.gr,
                          par.l,
                          applyLastBinStrategy = TRUE,
                          chrSizes.df,
                          verbose = TRUE) {

    # Check types and validity of arguments
    assertClass(userRegions.gr, "GRanges")
    assertList(par.l, any.missing = FALSE, min.len = 1)
    assertFlag(applyLastBinStrategy)
    assertDataFrame(chrSizes.df, min.rows = 1, min.cols = 2, col.names = NULL)
    assertFlag(verbose)

    # Bin each region according to the binSize parameter. Start from the leftmost position and continue to the right
    binSize  = par.l$binSize
    nRegionsCur = length(userRegions.gr)


    # Helper functions to generate the bin start and end coordinates for each region
    .createStartSeq  <-  function(x, userRegions.gr, binSize) {
        seq.int(start(userRegions.gr)[x], end(userRegions.gr)[x], binSize)
    }

    .createEndSeq  <-  function(x, userRegions.gr, binSize) {
        seq.int(start(userRegions.gr)[x], end(userRegions.gr)[x], binSize) + binSize - 1
    }
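
    # Worked example (illustrative coordinates): for a region spanning 1001-1500 with binSize = 50,
    # .createStartSeq yields 1001, 1051, ..., 1451 and .createEndSeq yields 1050, 1100, ..., 1500,
    # i.e. 10 adjacent bins of 50 bp each.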


    # Check here: https://support.bioconductor.org/p/66050/

    # Parallelize functions
    binStarts = .execInParallelGen(nCores = par.l$nCores,
                                   returnAsList = TRUE,
                                   listNames = NULL,
                                   iteration = seq_len(nRegionsCur),
                                   abortIfErrorParallel = TRUE,
                                   verbose = FALSE,
                                   functionName = .createStartSeq, userRegions.gr, binSize)

    binEnds   = .execInParallelGen(nCores = par.l$nCores,
                                   returnAsList = TRUE,
                                   listNames = NULL,
                                   iteration = seq_len(nRegionsCur),
                                   abortIfErrorParallel = TRUE,
                                   verbose = FALSE,
                                   functionName = .createEndSeq, userRegions.gr, binSize)

    nBinsPerRegion.vec = sapply(binEnds, length)

    # Adjust binEnds: the last bin of each region may have an end larger than the actual region end because the last bin can be shorter
    # Modifying an existing object within a function would normally require the global <<- operator.
    # However, DO NOT USE <<- in a parallel environment, it does not work! That's why we use a regular for loop here

    for (i in seq_len(nRegionsCur)) {
        index = nBinsPerRegion.vec[i]
        binEnds[[i]][index] = min(binEnds[[i]][index], end(userRegions.gr)[i])
    }

    .funLength  <-  function(x, nBinsPerRegion.vec) {rep.int(x, nBinsPerRegion.vec[x])}
    .funStrand  <-  function(x, userRegions.gr, nBinsPerRegion.vec) {rep(as.character(strand(userRegions.gr)[x]), nBinsPerRegion.vec[x])}
    .funChr     <-  function(x, userRegions.gr, nBinsPerRegion.vec) {rep(as.character(seqnames(userRegions.gr)[x]), nBinsPerRegion.vec[x])}

    bins.l = list()

    bins.l$chr = .execInParallelGen(nCores = par.l$nCores,
                                    returnAsList = FALSE,
                                    listNames = NULL,
                                    iteration = seq_len(nRegionsCur),
                                    abortIfErrorParallel = TRUE,
                                    verbose = FALSE,
                                    functionName = .funChr, userRegions.gr, nBinsPerRegion.vec)


    bins.l$start                = unlist(binStarts, use.names = FALSE)
    bins.l$end                  = unlist(binEnds, use.names = FALSE)

    bins.l$strand = .execInParallelGen(nCores = par.l$nCores,
                                       returnAsList = FALSE,
                                       listNames = NULL,
                                       iteration = seq_len(nRegionsCur),
                                       abortIfErrorParallel = TRUE,
                                       verbose = FALSE,
                                       functionName = .funStrand, userRegions.gr, nBinsPerRegion.vec)


    bins.l$origRegionIndex = .execInParallelGen(nCores = par.l$nCores,
                                                returnAsList = FALSE,
                                                listNames = NULL,
                                                iteration = seq_len(nRegionsCur),
                                                abortIfErrorParallel = TRUE,
                                                verbose = FALSE,
                                                functionName = .funLength, nBinsPerRegion.vec)


    bins.l$length               = bins.l$end  - bins.l$start + 1


    # If the strand information is discarded, change strand to *
    if (par.l$strand  ==  "both") {
        bins.l$strand = rep("*", length(bins.l$strand))
    } else if (par.l$strand  ==  "antisense") {

        bins.l$strand = gsub("+",".", bins.l$strand, fixed = TRUE)
        bins.l$strand = gsub("-","+", bins.l$strand, fixed = TRUE)
        bins.l$strand = gsub(".","-", bins.l$strand, fixed = TRUE)

    }
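
    # Illustration of the strand flip above: via the temporary "." placeholder,
    # c("+", "-", "*") becomes c("-", "+", "*"), i.e. plus and minus strands are swapped.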

    if (applyLastBinStrategy) {

        # The last bin may be shorter: Option to delete it, extend it beyond the requested boundaries or leave it as it is
        if (par.l$lastBinTreatment  ==  "keepUnchanged") {

            if (verbose) message(" The last bin in each region will be left untouched.")

        } else if (par.l$lastBinTreatment  ==  "delete") { # original user regions remain unchanged, because the last bin is deleted and not considered although the reads mapping there were already retrieved

            indexToDeleteSignal = which(bins.l$length < binSize )

            if (length(indexToDeleteSignal) > 0) {

                if (verbose) message(" The last bin in each region will be deleted as they are shorter than the other bins (",
                                     paste0(unique(bins.l$length[indexToDeleteSignal]), collapse = ","), " bp as compared to ", binSize, " bp).")

                bins.l$chr             = bins.l$chr            [-indexToDeleteSignal]
                bins.l$start           = bins.l$start          [-indexToDeleteSignal]
                bins.l$end             = bins.l$end            [-indexToDeleteSignal]
                bins.l$strand          = bins.l$strand         [-indexToDeleteSignal]
                bins.l$origRegionIndex = bins.l$origRegionIndex[-indexToDeleteSignal]
                bins.l$length          = bins.l$length         [-indexToDeleteSignal]

                #Each index that is deleted belongs to one region. Thus, the number of bins has to be adjusted in the nBinsPerRegion.vec vector
                .funBinsPerRegion  <-  function(x, bins.l, nRegionsCur) {
                    length(which(bins.l$origRegionIndex  ==  seq_len(nRegionsCur)[x]))
                }

                nBinsPerRegion.vec = .execInParallelGen(nCores = par.l$nCores,
                                                        returnAsList = FALSE,
                                                        listNames = NULL,
                                                        iteration = seq_len(nRegionsCur),
                                                        abortIfErrorParallel = TRUE,
                                                        verbose = FALSE,
                                                        functionName = .funBinsPerRegion, bins.l, nRegionsCur)

                # Adjust the user regions end coordinates accordingly
                nBins = unique(table(bins.l$origRegionIndex))
                assertIntegerish(nBins, len = 1)
                end(userRegions.gr) = start(userRegions.gr) + (nBins * par.l$binSize) - 1
            }

        } else if (par.l$lastBinTreatment  ==  "extend") { # In this particular case, the user regions have to be adapted in order to also retrieve the reads from the extended region associated with the last bin

            indexToExtend = which(bins.l$length < binSize )

            if (length(indexToExtend ) > 0) {
                if (verbose) message(" The last bin in each region will be extended outside of the specified region to the length of the other bins.")

                bins.l$end[indexToExtend]              = bins.l$end[indexToExtend] + (binSize - bins.l$length[indexToExtend])
                bins.l$length[indexToExtend]           = bins.l$end[indexToExtend]  - bins.l$start[indexToExtend] + 1

                # Check if chromosome boundaries may be crossed due to the extension
                indexBoundary = which(bins.l$end > chrSizes.df[bins.l$chr,]$size)

                if (length(indexBoundary) > 0) {

                    if (verbose) message(" WARNING: Due to the extension of the last bin, at least one bin crossed the chromosome end and had to be adjusted to the last position of the chromosome.")
                    chrSizes.df = .getGenomeData(par.l$assemblyVersion)
                    bins.l$end[indexBoundary] = chrSizes.df[bins.l$chr[indexBoundary],]$size

                }

                # Extend the user regions accordingly
                end(userRegions.gr)[bins.l$origRegionIndex[indexToExtend]]    = bins.l$end[indexToExtend]

            }

        }

    } # end applyLastBinStrategy

    list("bins" = bins.l, "userRegions" = userRegions.gr)

}


#' @import checkmate
#' @import GenomicRanges
# @importFrom GenomicRanges countOverlaps
#' @importFrom IRanges  IRangesList IRanges
.calculateOverlapsBin  <-  function(bamObj,
                                    par.l,
                                    bins.l,
                                    nOverlapsRegion.vec,
                                    verbose) {

    # Check types and validity of arguments
    assertList(bamObj, any.missing = FALSE, min.len = 1)
    assertList(par.l, min.len = 1, all.missing = FALSE)
    assertList(bins.l, min.len = 4, all.missing = FALSE)
    assertIntegerish(nOverlapsRegion.vec, lower = 0, any.missing = FALSE, min.len = 1)


    nBinsPerRegion.vec  = table(bins.l$origRegionIndex)
    nColumnsMax         = max(nBinsPerRegion.vec)
    nRegionsCur            = length(nOverlapsRegion.vec)
    # Create a matrix. Rownames = IDs of regions. Colnames: Bins

    results.m           = matrix(0, nrow = nRegionsCur , ncol = nColumnsMax)

    # If we have only one bin, save some computational time
    if (nColumnsMax  ==  1) {

        results.m[,1] = nOverlapsRegion.vec

        return(results.m)
    }

    # Do all the hard work: compare the bins from each region with the reads originating from that region.
    # This is done once per original region so that only the necessary comparisons are made and the resulting list contains only what is needed.
    # There are two possible options: one with IRanges and one with GRanges. The former is faster because no sequence and strand information is involved.
    # The sequence names are identical among all ranges, so they can be ignored here; the strand is handled elsewhere.
    # Newer versions of Bioconductor make this operation 3 to 10x faster. See https://stat.ethz.ch/pipermail/bioc-devel/2014-December/006749.html and check if a code change is needed

    .calcOverlapsBins  <-  function(x, bins.l, bamObj) {
        index = which(bins.l$origRegionIndex  ==  x)
        countOverlaps(IRangesList(start = bins.l$start[index], end = bins.l$end[index]),
                      IRangesList(IRanges(start = bamObj[[x]]$pos, width = bamObj[[x]]$qwidth)))
    }
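
    # Simplified sketch of the counting idea with plain IRanges (hypothetical coordinates,
    # not the IRangesList form used above): three adjacent 50-bp bins vs. two reads
    # countOverlaps(IRanges(start = c(1, 51, 101), width = 50),
    #               IRanges(start = c(10, 120), width = 36))
    # returns c(1, 0, 1): the first read falls into bin 1, the second into bin 3.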

    # Make faster and only calculate overlaps if the region count in nOverlapsRegion.vec is > 0
    indexes.vec = setdiff(seq_len(nRegionsCur), which(nOverlapsRegion.vec  ==  0))

    if (length(indexes.vec) > 0) {

        # Compute in parallel if possible
        results.l = .execInParallelGen(nCores = par.l$nCores,
                                       returnAsList = TRUE,
                                       listNames = NULL,
                                       iteration = indexes.vec,
                                       abortIfErrorParallel = TRUE,
                                       verbose = FALSE,
                                       functionName = .calcOverlapsBins, bins.l, bamObj)


        # Slower but uses GRanges instead of IRanges
        #   results.l = sapply(seq_len(nRegionsCur), FUN = function(x) {index=which(bins.l$origRegionIndex  ==  x);
        #     countOverlaps(GRanges(seqnames = bins.l$chr[index],    ranges = IRanges(start=bins.l$start[index], end=bins.l$end[index]), strand= bins.l$strand[index]),
        #                   GRanges(seqnames= bamSignal_cur[[x]]$rname, ranges = IRanges(start=bamSignal_cur[[x]]$pos, width=bamSignal_cur[[x]]$qwidth)))
        #   })
        # nReadCounts.vec = unlist(results.l)


        # Unlist, level 1

        # Make sure all list elements have the same length
        length.vec = sapply(results.l, length)

        assertSetEqual(length(unique(length.vec)),1)

        nReadCounts.vec = .execInParallelGen(nCores = par.l$nCores,
                                             returnAsList = FALSE,
                                             listNames = NULL,
                                             iteration = results.l,
                                             abortIfErrorParallel = TRUE,
                                             verbose = FALSE,
                                             functionName =  match.fun("[["), 1)


        # Save the read counts into the matrix
        # A parallelized version caused a bug before, so let's switch back to the original version. Since nRegionsCur is not so large, this does not cost much execution time
        for (x in indexes.vec) {

            # Make sure that the startIndex is set correctly and that no gaps form when some rows are skipped
            indexVec = which(indexes.vec  ==  x)
            startIndex = sum(nBinsPerRegion.vec[seq_len(indexVec - 1)]) + 1
            results.m[x,1:nBinsPerRegion.vec[x]] = nReadCounts.vec[startIndex:(startIndex + nBinsPerRegion.vec[indexVec] - 1)]
        }
    } else {# No overlaps at all
        # Nothing to do here, maybe except a warning
    }

    results.m

}

#' @importFrom IRanges as.matrix IRanges
#' @import checkmate
#' @import GenomicRanges
#' @importFrom S4Vectors as.table
.calculateOverlapsReads  <-  function(startPos.l,
                                      width.l,
                                      par.l,
                                      bins.l,
                                      nOverlapsRegion.vec,
                                      verbose) {

    # Check types and validity of arguments
    assertList(startPos.l, any.missing = FALSE, len = length(nOverlapsRegion.vec))
    assertList(width.l, any.missing = FALSE, len = length(nOverlapsRegion.vec))
    assertList(par.l, min.len = 1, all.missing = FALSE)
    assertList(bins.l, min.len = 4, all.missing = FALSE)
    assertIntegerish(nOverlapsRegion.vec, lower = 0, any.missing = FALSE, min.len = 1)


    nBinsPerRegion.vec  = table(bins.l$origRegionIndex)
    nColumnsMax         = max(nBinsPerRegion.vec)
    nRegionsCur         = length(nOverlapsRegion.vec)
    # Create a matrix. Rownames = IDs of regions. Colnames: Bins

    results.l           =  list(start  = vector( "list", nRegionsCur),
                                end  = vector( "list", nRegionsCur))

    # If we have only one bin, save some computational time
    if (nColumnsMax  ==  1) {

        # All reads overlap with only bin 1, start and end
        results.l = lapply(1:nRegionsCur,
                           function(x) list(start = rep(1, nOverlapsRegion.vec[x]), end = rep(1, nOverlapsRegion.vec[x])))

        # Transform to a slightly different nested format and return
        return(list(start = lapply(results.l, "[[", "start"), end = lapply(results.l, "[[", "end")))

    }

    # Do all the hard work. See .calcOverlapsBins, just switched around
    .calcOverlapsReads  <-  function(x, bins.l, startPos.l, width.l) {
        index = which(bins.l$origRegionIndex  ==  x)

        res2 = findOverlaps(IRanges(start = startPos.l[[x]], width = width.l[[x]]),
                            IRanges(start = bins.l$start[index], end = bins.l$end[index]),
                            type = "any")

        hitsList.m = as.matrix(res2)

        # Grep the start bins from the subjectHits column
        start = as.numeric(hitsList.m[which(!duplicated(hitsList.m[,1])),2])

        # Now use a trick to extract the end column: Use as.table to convert to a table that lists for each query how often it overlapped.
        # Because overlaps in a particular user region must be, by definition, adjacent, the end equals the start + length -1
        end   = as.numeric(start + (as.numeric(as.table(res2)) - 1))

        stopifnot(length(start) == length(end))

        # Old: Too memory consuming because a lot of regions have a low number of reads
        #RleList(list(start=start, end=end), compress = TRUE)
        list(start = start, end = end)

    }
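
    # Worked example (hypothetical): a read overlapping bins 3 and 4 of a region produces two hits
    # in findOverlaps(); the first (non-duplicated) subjectHit gives start = 3, and because the read
    # overlapped 2 adjacent bins, end = 3 + 2 - 1 = 4.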

    # Make faster and only calculate overlaps if the region count in nOverlapsRegion.vec is > 0
    indexes.vec = setdiff(seq_len(nRegionsCur), which(nOverlapsRegion.vec  ==  0))


    if (length(indexes.vec) > 0) {

        # Compute in parallel if possible
        resultsTmp.l  = .execInParallelGen(nCores = par.l$nCores,
                                           returnAsList = TRUE,
                                           listNames = NULL,
                                           iteration = indexes.vec,
                                           abortIfErrorParallel = TRUE,
                                           verbose = FALSE,
                                           functionName = .calcOverlapsReads, bins.l, startPos.l, width.l)

        # Get back the correct order because some 0 regions may have been skipped
        for (i in seq_len(length(resultsTmp.l))) {
            results.l[["start"]][[indexes.vec[i]]] = resultsTmp.l[[i]]$start
            results.l[["end"]]  [[indexes.vec[i]]] = resultsTmp.l[[i]]$end
        }
        rm(resultsTmp.l)


    } else {# No overlaps at all
        stop("No overlaps found. Abort.")
    }

    results.l
}

#' @import checkmate
.calcRandomBackgroundDistr <- function(nRegions,
                                       nBins,
                                       readOverlaps.l) {

    assertInt(nRegions, lower = 1)
    assertInt(nBins, lower = 1)
    assertList(readOverlaps.l)

    res.l = list()
    for (alleleCur in names(readOverlaps.l)) {
        res.l[[alleleCur]] = matrix(0, nrow = nRegions, ncol = nBins)
    }
    # Loop through both alleles and all regions, and randomly reassign each read to one of the two alleles

    # If the number of bins is 1, the result of this function will always be the same because alleles are chosen 50/50 and
    # there are no other overlapping bins, so no variation is possible

    stopifnot(nRegions == length(readOverlaps.l[[1]][["start"]]))

    for (alleleCur in names(readOverlaps.l)) {

        nReads = sapply(readOverlaps.l[[alleleCur]][["start"]], length)

        # Assign each read to read group 1 initially
        rnd.l = sapply(1:nRegions, function(x) rep(1, nReads[x]))


        for (regionCur in which(nReads > 0)) {

            # Randomly assign each read to one of the two alleles, named 1 and 2 hereafter for simplicity. Make sure that the distribution is 50/50
            rnd.l[[regionCur]][sample(1:nReads[regionCur], floor(nReads[regionCur] / 2), replace = FALSE)] = 2

            rndAlleles = names(readOverlaps.l)[rnd.l[[regionCur]]]
            binStarts = as.numeric(readOverlaps.l[[alleleCur]] [["start"]] [[regionCur]] )
            binEnds   = as.numeric(readOverlaps.l[[alleleCur]] [["end"]]   [[regionCur]] )

            # For each read, increase the counts of the randomly assigned read group at the particular bins the read overlaps with
            for (indexRead in seq_len(nReads[regionCur])) {
                cols = binStarts[indexRead]:binEnds[indexRead]
                alleleAssigned = rndAlleles[indexRead]
                res.l[[alleleAssigned]][regionCur, cols] = res.l[[alleleAssigned]][regionCur, cols] + 1
            }

        }

    } # end for (alleleCur in names(readOverlaps.l))

    res.l

}
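
# Standalone sketch (not package code) of the 50/50 reassignment used above:
# for 10 reads, exactly floor(10/2) = 5 reads are switched to the second read group.
# assignment = rep(1, 10)
# assignment[sample(1:10, floor(10/2), replace = FALSE)] = 2
# table(assignment)   # 5 reads in group 1, 5 in group 2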

#' @import checkmate
.constructScanBamFlags  <-  function(par.l) {

    assertList(par.l, min.len = 1, all.missing = FALSE)

    .constructScanBamFlagsGen(isPaired                   = par.l$readFlag_isPaired,
                              isProperPair                = par.l$readFlag_isProperPair,
                              isUnmappedQuery             = par.l$readFlag_isUnmappedQuery,
                              hasUnmappedMate             = par.l$readFlag_hasUnmappedMate,
                              isMinusStrand               = par.l$readFlag_isMinusStrand,
                              isMateMinusStrand           = par.l$readFlag_isMateMinusStrand,
                              isFirstMateRead             = par.l$readFlag_isFirstMateRead,
                              isSecondMateRead            = par.l$readFlag_isSecondMateRead,
                              isSecondaryAlignment        = par.l$readFlag_isNotPrimaryRead,
                              isNotPassingQualityControls = par.l$readFlag_isNotPassingQualityControls,
                              isDuplicate                 = par.l$readFlag_isDuplicate
    )
}


#' Main function of \emph{SNPhood}
#'
#' \code{analyzeSNPhood} is the main function of the \code{SNPhood} package.
#' All results, parameters and metadata are stored in an object of class \code{\linkS4class{SNPhood}}.
#'
#' \strong{If you already have BAM files in objects of class \code{\linkS4class{BamFile}} or \code{\linkS4class{BamFileList}},
#' see the function \code{\link{collectFiles}} for how to seamlessly integrate them into the \code{SNPhood} framework.}
#'
#' In addition, see the vignettes for more details.
#'
#' @param par.l Named list. Named list with all required parameter names and their respective values, which
#' should be generated via the helper function \code{\link{getDefaultParameterList}}.
#' Note that all supported parameters must be defined in the list, as obtained by the function \code{\link{getDefaultParameterList}}.
#' See also ?getDefaultParameterList for details.
#' @param files.df Data frame with at least the column "signal" specifying the absolute paths to the BAM files that will be processed.
#' Optionally, further columns can be added.
#' Supported are "input", "individual" and "genotype". See the Vignette for further details.
#' The data frame can either be created manually or via the helper function \code{\link{collectFiles}}.
#' @param onlyPrepareForDatasetCorrelation Logical(1). Default FALSE. If set to TRUE, only the steps necessary to analyze
#' the correlation among datasets with respect to their read counts are performed, which is less time-consuming than running the full pipeline.
#' This is a quality control step to identify outlier datasets
#' that show artefacts and that should therefore be removed from the analysis. If set to FALSE (the default), the full pipeline is
#' executed. In both cases, the function \code{\link{plotAndCalculateCorrelationDatasets}} can be executed afterwards.
#' @template verbose_TRUE
#'
#' @return Object of class \code{\linkS4class{SNPhood}}. See the class description (?"SNPhood-class", or click the link) for details.
#' @examples
#' ## For the following example, see also the workflow vignette!
#' library(SNPhoodData)
#' # get a list of files to process
#' dataDir = system.file("extdata", package = "SNPhoodData")
#' files.df = collectFiles(patternFiles = paste0(dataDir,"/*.bam"))
#' files.df$individual = c("GM10847", "GM10847", "GM12890", "GM12890")
#' fileUserRegions = list.files(pattern = "*.txt",dataDir, full.names = TRUE)
#' par.l = getDefaultParameterList(path_userRegions = fileUserRegions)
#' par.l$poolDatasets = TRUE
#'
#' # Run the main function with the full pipeline
#' SNPhood.o = analyzeSNPhood (par.l, files.df)
#' @export
#' @import checkmate
#' @import Rsamtools
#  @importFrom Rsamtools
analyzeSNPhood  <-  function(par.l,
                             files.df,
                             onlyPrepareForDatasetCorrelation = FALSE,
                             verbose = TRUE) {

    # Check types and validity of arguments
    if (!testList(par.l, all.missing = FALSE, len = length(getDefaultParameterList("/path/to/userRegions.example")))) {
        stop("The provided argument par.l is not valid. It must be a named list of length ",
             length(getDefaultParameterList("test")),
             ". The names of the list must be the names of the required parameters. See ?analyzeSNPhood, ?getDefaultParameterList and the vignettes for details.")
    }

    # memory.profile()
    if (verbose) message("Total size of all objects: ", round(sum(memory.profile()) / (1024*1024),1), " Mb")

    assertDataFrame(files.df, min.rows = 1, min.cols = 1)
    assertFlag(onlyPrepareForDatasetCorrelation)
    assertFlag(verbose)

    start.time  <-  Sys.time()

    if (verbose) message("\n\nSTART WITH AUTOMATED PIPELINE\n")
    if (verbose) message("\nThe following arguments have been provided:")

    ##########################################################
    # Check the configuration file and files to be processed #
    ##########################################################

    par.l = .checkConfigFile(par.l, verbose)

    # Make sure that the files data frame does not contain any factors
    files.df[] <- lapply(files.df, as.character)

    files.df$input[which(files.df$input  ==  "")] = NA

    # Remove spaces at the beginning and the end and after commas as well as newline characters
    files.df$input = gsub("[\r\n]", "", files.df$input)
    files.df$input = sub('^\\s+','' , files.df$input, perl = TRUE)
    files.df$input = sub(',\\s+',',', files.df$input, perl = TRUE)
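    # e.g. an entry like " file1.bam, file2.bam\n" becomes "file1.bam,file2.bam"
    # (illustrative file names, not actual data)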


    if (par.l$normByInput & par.l$readGroupSpecific) {
        warning("Input normalization has been turned off because read-group specific reads have been requested. Currently, input normalization is not possible with read-group specific reads.")
        files.df$input = NA
        par.l$normByInput = FALSE
    }

    if (par.l$normByInput & par.l$effectiveGenomeSizePercentage == -1 & !par.l$assemblyVersion %in% c("hg38", "hg19", "hg18", "mm10", "mm9", "dm3")) {
        stop("For the specified genome assembly, SNPhood does not have default values for the parameter effectiveGenomeSizePercentage. Please adjust the value manually. See the vignette or help pages for details.")
    }

    if (par.l$normByInput & any(is.na(files.df$input))) {
        stop("Input normalization has been enabled, but at least one file has no corresponding input file as a control.")
    }

    if (!par.l$normByInput & any(!is.na(files.df$input))) {
        stop("Input normalization has been disabled, but filenames for input files have been provided. Either enable input normalization or set input files to NA")
    }


    # Make sure the order of appearance does not play a role in files.df$input
    if (!any(is.na(files.df$input))) {
        input.split  = strsplit(files.df$input, ",", fixed = TRUE)
        input.split.length = sapply(input.split, length)

        for (i in which(input.split.length > 1)) {
            files.df$input[i] = paste(sort(input.split[[i]]), collapse = ",")
        }
    }
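    # e.g. the (hypothetical) entries "input2.bam,input1.bam" and "input1.bam,input2.bam"
    # both become "input1.bam,input2.bam" and are thus recognized as the same input file set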


    # Not possible to use the same signal files multiple times.
    assertSetEqual(length(unique(files.df$signal)), length(files.df$signal))

    if (par.l$poolDatasets) {
        if (!"individual" %in% colnames(files.df)) stop("The parameter \"poolDatasets\" is set to TRUE but the column \"individual\" was not found in data frame files.df." )

        # Check if all files belonging to the same individual have identical input files
        for (individual in unique(files.df$individual)) {
            if (length(unique(files.df[which(files.df$individual  ==  individual),]$input)) > 1) {
                stop("Different input files are assigned across all individuals with ID \"", individual,"\". Verify that the set of input files is identical among data sets with the same individual ID", sep = "")
            }
        }

    } else {
        files.df$individual = files.df$signal  # Overwrite custom individual IDs to make sure we don't store redundant information (signal file and the corresponding ID) in the final object
    }

    nFilesToProcess = nrow(files.df)
    nIndividualsToProcess = length(unique(files.df$individual))

    # Make sure the normalization among the different data files is only performed when necessary and useful
    if (par.l$normByInput | nFilesToProcess  ==  1 | par.l$readGroupSpecific) {
        if (par.l$normAmongEachOther) warning("Forcing parameter normAmongEachOther to FALSE because either input normalization is turned on, only one file is going to be processed, or allele-specific reads are requested")
        par.l$normAmongEachOther = FALSE
    }

    ###############################
    # Check genotype integration #
    ###############################

    integrateGenotype = TRUE

    if (!"genotype" %in% colnames(files.df)) {
        integrateGenotype = FALSE
        files.df$genotype = NA
        files.df$genotype = as.character(files.df$genotype)
    } else {
        if (all(is.na(files.df$genotype))) {
            integrateGenotype = FALSE
        } else {

            index = which(!is.na(files.df$genotype))
            res.split = strsplit(files.df$genotype[index], split = ":")
            len = sapply(res.split,length)

            assertInteger(unique(len), len = 1, lower = 2, upper = 2)

            genotypeMapping.df = data.frame(samples = files.df$signal[index],
                                            genotypeFile = sapply(res.split, "[[", 1),
                                            sampleName   = sapply(res.split, "[[", 2)
            )

        }
    }

    if (par.l$poolDatasets & integrateGenotype) {
        stop("Cannot integrate genotypes automatically because of dataset pooling and potential mapping mixups. Please set the genotype column in the input data.frame to NA, and call the function again. To automatically integrate the genotypes, call the function associateGenotypes manually afterwards by prodiving a 1:1 mapping for individuals and genotypes.")
    }

    # Put into parameter list
    par.l$input = files.df
    par.l$onlyPrepareForDatasetCorrelation = onlyPrepareForDatasetCorrelation

    #########################
    # Create SNPhood object #
    #########################
    SNPhood.o  = .createSNPhoodObject(par.l, onlyPrepareForDatasetCorrelation)

    countType = "readCountsRaw"
    if (par.l$normByInput) countType = "enrichment"
    if (par.l$normAmongEachOther) countType = "readCountsNormalized"

    SNPhood.o@internal$countType = countType



    ###############################
    # Parse the user regions file #
    ###############################

    chrSizes.df = .getGenomeData(par.l$assemblyVersion, includeChrM = TRUE)
    userRegions.gr = .parseAndProcessUserRegions(par.l, chrSizes.df, verbose = verbose)

    nRegionsCur = length(userRegions.gr)

    SNPhood.o@annotation$regions = userRegions.gr
    names(SNPhood.o@annotation$regions) = SNPhood.o@annotation$regions$annotation

    SNPPos.gr = .getSNPGRangesObj(annotation(SNPhood.o))

    ###############################
    # Split the regions into bins #
    ###############################
    start.timeTemp  <-  Sys.time()

    if (!onlyPrepareForDatasetCorrelation) {
        # This has to be done before parsing the BAM file, as the user regions may change as a consequence of the lastBinTreatment parameter

        if (verbose) message(" Split regions into bins")
        res.l = .createBins(annotation(SNPhood.o)$regions,  parameters(SNPhood.o), applyLastBinStrategy = TRUE, chrSizes.df)

        bins.l                  = res.l$bins
        SNPhood.o@annotation$regions = res.l$userRegions

        nRegionsBins   = length(bins.l$chr)
        nBinsCur    = max(table(bins.l$origRegionIndex))

        if (verbose) message(" Split ", nRegionsCur," entries into ", nRegionsBins," bins")

        SNPhood.o@annotation$bins  = paste0("bin_", seq_len(nBinsCur))

        # Store the bin number where the SNP is located
        SNPhood.o@internal$plot_origBinSNPPosition = (nBinsCur + 1)/2
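        # e.g. with 21 bins per region, the SNP-containing bin is (21 + 1) / 2 = 11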

        SNPhood.o@internal$plot_labelBins = .calcBinLabelsPlot(nBinsCur)

    }


    .printExecutionTime(start.timeTemp, verbose)

    #.getMemoryProfile(SNPhood.o, verbose = verbose)


    ########################################################
    # Prepare the data structure if not initialized before #
    ########################################################

    # Parse the first signal file and determine the number and names of the read groups
    res.l = .BAMHeaderConsistencyChecks(par.l, SNPhood.o@annotation$regions, files.df$signal[1], NULL, verbose = TRUE)
    SNPhood.o@config$readGroupSpecific = res.l$readGroupSpecific
    SNPhood.o = .initSNPhoodObject(SNPhood.o, files.df, res.l$readGroups, onlyPrepareForDatasetCorrelation)


    ###############################################
    # ITERATE OVER ALL UNIQUE SETS OF INPUT FILES #
    ###############################################


    # Check if the set of input files has already been read in before
    # Use an extra data structure to track which files or combinations of files have already been processed
    processedFileSets.l = list()


    inputFileSetCounter = 0
    for (inputFileSetCur in unique(files.df$input)) {

        start_inputSet = Sys.time()
        inputFileSetCounter = inputFileSetCounter + 1

        if (verbose) message("\nPROCESS INPUT FILE SET ", inputFileSetCur, " (", inputFileSetCounter, " of ", length(unique(files.df$input)), ")")

        # Identify which rows have this set of input files
        indexRowsInput.vec = which(files.df$input  ==  inputFileSetCur)

        if (is.na(inputFileSetCur)) {
            indexRowsInput.vec = which(is.na(files.df$input))
        }

        if (par.l$normByInput) {

            inputFileSetCur.vec = strsplit(inputFileSetCur, ",", fixed = TRUE)[[1]]
            if (verbose) message(" Corresponding set of input files that will be used as a control: ", paste0(seq_len(length(inputFileSetCur.vec)), ":", inputFileSetCur.vec, collapse = ","))

            #################################################################
            # ITERATE OVER ALL UNIQUE INPUT FILES THAT ARE PART OF THIS SET #
            #################################################################

            if (testNull(processedFileSets.l[[inputFileSetCur]])) {

                readGroups = annotationReadGroups(SNPhood.o)
                results.l = .extractAndNormalize(par.l, SNPhood.o@annotation$regions, readGroups, SNPPos.gr, inputFileSetCur.vec, inputFileSetCur, bins.l, type = "input", verbose = TRUE)

                SNPhood.o@internal$globalBackground[[inputFileSetCur]] = results.l$globalBackground

                # There will only be one read group here, otherwise normalization cannot happen
                for (alleleCur in readGroups) {

                    SNPhood.o@readCountsBinned     [[alleleCur]] [[inputFileSetCur]] = results.l$readCountsBinned   [[alleleCur]]
                    SNPhood.o@readCountsUnbinned   [[alleleCur]] [[inputFileSetCur]] = results.l$readCountsUnbinned [[alleleCur]]

                }

                rm(results.l)

            }

            processedFileSets.l[[inputFileSetCur]] = TRUE

            if (verbose) message("\nFINISHED PARSING INPUT FILES.\n")

            if (verbose) message("\nPROCESS INDIVIDUALS WITH INPUT SET ", inputFileSetCur)
        }

        #################################################################################
        # ITERATE OVER ALL UNIQUE SETS OF INDIVIDUALS WITH THE GIVEN SET OF INPUT FILES #
        #################################################################################

        individualsToProcess = as.character(unique(files.df$individual[indexRowsInput.vec]))

        nIndividualsToProcess = length(individualsToProcess)
        nIndProcessed = 0

        for (individualCur in individualsToProcess) {

            nIndProcessed = nIndProcessed + 1
            # Identify all rows that match
            indexRowsSignal.vec = which(files.df$individual  ==  individualCur)


            if (verbose) message("\nPROCESS INDIVIDUAL ", nIndProcessed," of ", nIndividualsToProcess,": ", individualCur)
            start_individual = Sys.time()


            ##############################################################
            # ITERATE OVER ALL SIGNAL FILES WITH THE GIVEN INDIVIDUAL ID #
            ##############################################################

            signalFilesIndAll = as.character(unique(files.df$signal[indexRowsSignal.vec]))

            SNPhood.o@annotation$files[[individualCur]] =
                list(type = "signal", files = signalFilesIndAll, composite = TRUE, origName = individualCur, input = NA)


            if (par.l$normByInput) {

                # Add the information about corresponding input files
                isComposite = ifelse(length(inputFileSetCur.vec) > 1, TRUE, FALSE)

                SNPhood.o@annotation$files[[individualCur]]$input =
                    list(files = inputFileSetCur.vec, composite = isComposite, origName = inputFileSetCur)
            }



            ####################################################################
            # Parse all files that belong to the individual and extract counts #
            ####################################################################

            readGroups = annotationReadGroups(SNPhood.o)
            results.l = .extractAndNormalize(par.l, SNPhood.o@annotation$regions, readGroups, SNPPos.gr, signalFilesIndAll, individualCur, bins.l, type = "signal", verbose = TRUE)

            SNPhood.o@internal$globalBackground[[individualCur]] = results.l$globalBackground

            for (alleleCur in readGroups) {
                SNPhood.o@internal$readStartPos[[alleleCur]] [[individualCur]] = results.l$readStartPos       [[alleleCur]]
                SNPhood.o@internal$readWidth   [[alleleCur]] [[individualCur]] = results.l$readWidth          [[alleleCur]]
                SNPhood.o@readCountsBinned     [[alleleCur]] [[individualCur]] = results.l$readCountsBinned   [[alleleCur]]
                SNPhood.o@readCountsUnbinned   [[alleleCur]] [[individualCur]] = results.l$readCountsUnbinned [[alleleCur]]

                SNPhood.o@annotation$genotype$readsDerived [[alleleCur]] [[individualCur]] = results.l$genotypes [[alleleCur]]


            }

            rm(results.l)


            ####################
            # Normalize counts #
            ####################
            if (par.l$normByInput) {

                alleleCur = "allReadGroups"

                if (verbose) message(" Normalize library sizes...")

                counts.m = matrix(c(unlist(SNPhood.o@readCountsUnbinned[[alleleCur]] [[individualCur]]  , use.names = FALSE),
                                    unlist(SNPhood.o@readCountsUnbinned[[alleleCur]] [[inputFileSetCur]], use.names = FALSE)), byrow = FALSE, nrow = nRegionsCur)

                stopifnot(all(counts.m >= 0))

                colnames(counts.m) = c(individualCur, inputFileSetCur)
                res.l = .scaleLibraries(counts.m)

                # Update the library sizes and count matrices
                sizeFactors.vec         = res.l$sizeFactors
                rm(res.l)

                # Important: Append the size factors because both signal and input may be used multiple times with
                # other input and signal files, respectively, and each such pairing yields a different size factor.
                # Reverse the names so that each dataset stores its own size factor under the name of the partner
                # dataset it was computed against; this allows the factor to be traced back to the corresponding
                # signal/input pairing.
                # TODO: Remove, might be too complicated? Really worth saving in the object?
                names(sizeFactors.vec) = rev(names(sizeFactors.vec))
                SNPhood.o@internal$sizeFactors[[alleleCur]] [[individualCur]]   = c(SNPhood.o@internal$sizeFactors[[alleleCur]] [[individualCur]]  , sizeFactors.vec[inputFileSetCur])
                SNPhood.o@internal$sizeFactors[[alleleCur]] [[inputFileSetCur]] = c(SNPhood.o@internal$sizeFactors[[alleleCur]] [[inputFileSetCur]], sizeFactors.vec[individualCur])
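                # Illustrative sketch (hypothetical names and numbers): if .scaleLibraries returns
                #   c(signal.bam = 0.8, input.bam = 1.25),
                # reversing the names gives c(input.bam = 0.8, signal.bam = 1.25), so the entry appended under the
                # signal dataset is its own factor (0.8) named after the input set, and vice versa.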


                ###########################################
                # Calculate enrichment ratio for each bin #
                ###########################################
                # Divide the observed signal by the maximum of the global average input count per bin, the
                # region-wise mean input count per bin, and the observed input count in each bin.
                # This makes the enrichment less vulnerable to stochastic fluctuations than dividing by the
                # observed input count in each bin alone.

                # Retrieve the size factors for this particular signal/input pairing by name
                # (the names were reversed above so that each dataset stores its factor under the partner's name)
                sizeFactorInput      = SNPhood.o@internal$sizeFactors[[alleleCur]] [[inputFileSetCur]] [[individualCur]]
                sizeFactorIndividual = SNPhood.o@internal$sizeFactors[[alleleCur]] [[individualCur]]   [[inputFileSetCur]]

                nReadsAvgBinGlobalInputNorm = SNPhood.o@internal$globalBackground[[inputFileSetCur]] / sizeFactorInput
                nReadsAvgBinLocalInputNorm  = rowMeans(SNPhood.o@readCountsBinned[[alleleCur]] [[inputFileSetCur]] / sizeFactorInput)

                # The normalization factor is specific for each region but identical among bins of a particular region

                # First produce a vector of length nRegions with the maximum of global and local bin
                maxBinAvg.vec = pmax(nReadsAvgBinGlobalInputNorm, nReadsAvgBinLocalInputNorm)

                # Now also incorporate the actual observed normalized read count values for each bin
                maxAllInput.m = pmax(matrix(rep(maxBinAvg.vec, nBinsCur), ncol = nBinsCur),
                                     (SNPhood.o@readCountsBinned[[alleleCur]] [[inputFileSetCur]] / sizeFactorInput)
                )


                SNPhood.o@enrichmentBinned[[alleleCur]] [[individualCur]] =
                    (SNPhood.o@readCountsBinned[[alleleCur]] [[individualCur]] / sizeFactorIndividual) / maxAllInput.m
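                # Toy example of the enrichment calculation above (hypothetical numbers): for one region with
                # 3 bins, an input-normalized global bin average of 5 and a region mean of 4 give a per-region
                # maximum of pmax(5, 4) = 5; with normalized input bin counts c(2, 7, 3), the denominator is
                # pmax(5, c(2, 7, 3)) = c(5, 7, 5), and normalized signal counts c(10, 14, 5) yield an
                # enrichment of c(2, 2, 1) for that region.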


                rm(maxAllInput.m,maxBinAvg.vec)

            } # end input size normalization


            #.getMemoryProfile(SNPhood.o, verbose = verbose)

            #############################################
            # SAVE SOME MEMORY AND DELETE UNNEEDED DATA #
            #############################################

            if (!SNPhood.o@config$keepAllReadCounts) {

                # Save memory and delete read count objects from the files before pooling
                if (SNPhood.o@config$poolDatasets) {

                    # Delete the per-file entries that have been pooled into the composite dataset
                    filesToDelete = SNPhood.o@annotation$files[[individualCur]]$files

                    if (par.l$normByInput) {
                        filesToDelete = c(filesToDelete, SNPhood.o@annotation$files[[individualCur]]$input$files)
                    }

                    for (fileCur in filesToDelete) {

                        SNPhood.o@annotation$files[[fileCur]] = NULL

                        for (readGroupCur in annotationReadGroups(SNPhood.o)) {

                            SNPhood.o@readCountsUnbinned   [[readGroupCur]] [[fileCur]] = NULL
                            SNPhood.o@readCountsBinned     [[readGroupCur]] [[fileCur]] = NULL
                            SNPhood.o@annotation$genotype$readsDerived[[readGroupCur]] [[fileCur]] = NULL
                            SNPhood.o@internal$readStartPos[[readGroupCur]] [[fileCur]] = NULL
                            SNPhood.o@internal$readWidth   [[readGroupCur]] [[fileCur]] = NULL
                        }

                    }

                }

                # Reset the readCountsBinned entries because they have been replaced by the binned enrichment

                for (readGroupCur in annotationReadGroups(SNPhood.o)) {

                    if (SNPhood.o@config$normByInput) {
                        SNPhood.o@readCountsBinned[[readGroupCur]][[individualCur]] = NULL

                    }
                }


            } # end if (!SNPhood.o@config$keepAllReadCounts)


            # Optimize memory footprint and sort read start positions and width
            if (nReadGroups(SNPhood.o) > 1) {

                for (readGroupCur in annotationReadGroups(SNPhood.o)) {

                    nReads.vec = sapply(SNPhood.o@internal$readStartPos[[readGroupCur]][[individualCur]], length)

                    for (regionCur in which(nReads.vec > 0)) {

                        sortOrder = order(SNPhood.o@internal$readStartPos[[readGroupCur]][[individualCur]][[regionCur]])

                        SNPhood.o@internal$readStartPos[[readGroupCur]][[individualCur]][[regionCur]] = SNPhood.o@internal$readStartPos[[readGroupCur]][[individualCur]][[regionCur]][sortOrder]
                        SNPhood.o@internal$readWidth   [[readGroupCur]][[individualCur]][[regionCur]] = SNPhood.o@internal$readWidth   [[readGroupCur]][[individualCur]][[regionCur]][sortOrder]

                        # Can it be further compressed?
                        uniqueVal = unique(SNPhood.o@internal$readWidth[[readGroupCur]][[individualCur]][[regionCur]])
                        if (length(uniqueVal) == 1) {
                            SNPhood.o@internal$readWidth[[readGroupCur]][[individualCur]][[regionCur]] = uniqueVal
                        }
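                        # e.g. read widths c(36, 36, 36) are stored as the single value 36; the per-read vector
                        # can be recovered later from the number of stored read start positions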

                    }


                }

            } # end if (nReadGroups(SNPhood.o) > 1)

            #.getMemoryProfile(SNPhood.o, verbose = verbose)

            if (verbose) message("\nFINISHED PROCESSING INDIVIDUAL\n")
            .printExecutionTime(start_individual, verbose = verbose)


        }  # end for (individualCur in individualsToProcess)

        if (verbose) message("\nFINISHED PROCESSING INPUT SET ", inputFileSetCur, ".\n")
        .printExecutionTime(start_inputSet, verbose = verbose)


    } # end for each unique set of input files

    if (onlyPrepareForDatasetCorrelation) {

        if (verbose) message("\n\nFINISHED SUCCESSFULLY WITH ALL INPUT FILES.\n")
        .printExecutionTime(start.time, verbose = verbose)

        warning("Note that you set the parameter \"onlyPrepareForDatasetCorrelation\" to TRUE. You will not be able to use any functionality except the sample correlation plots. For a full analysis, run the function again and set the parameter to FALSE.")

        message("Warnings may have occurred, as indicated. Please check them carefully with unique(warnings()).")
        return(SNPhood.o)
    }

    #######################################################################################
    # Scale all libraries to a common size if no other normalization has been done before #
    #######################################################################################
    if (par.l$normAmongEachOther) {

        if (verbose) message("Parameter \"normAmongEachOther\" has been set to TRUE. Normalize all files among each other.")

        alleleCur = annotationReadGroups(SNPhood.o)
        stopifnot(length(alleleCur) == 1)

        # Construct a matrix with the unbinned region counts, one column per dataset
        uniqueInd = unique(files.df$individual)
        counts.m = matrix(NA, ncol = length(uniqueInd), nrow = nRegionsCur)

        for (i in seq_len(length(uniqueInd))) {
            counts.m[, i] = unlist(SNPhood.o@readCountsUnbinned[[alleleCur]] [[uniqueInd[i]]], use.names = FALSE)
        }

        colnames(counts.m) = uniqueInd

        # Obtain size factors
        res.l        = .scaleLibraries(counts.m)

        # Divide each matrix by the size factors
        for (i in seq_len(length(uniqueInd))) {

            SNPhood.o@readCountsUnbinned[[alleleCur]] [[uniqueInd[i]]] = SNPhood.o@readCountsUnbinned[[alleleCur]] [[uniqueInd[i]]] / res.l$sizeFactors[i]
            SNPhood.o@readCountsBinned  [[alleleCur]] [[uniqueInd[i]]] = SNPhood.o@readCountsBinned  [[alleleCur]] [[uniqueInd[i]]] / res.l$sizeFactors[i]

        }
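        # Sketch of the effect (hypothetical numbers): with size factors c(ind1 = 0.5, ind2 = 2), a region count
        # of 10 in ind1 becomes 10 / 0.5 = 20 and a count of 40 in ind2 becomes 40 / 2 = 20, making the
        # unbinned and binned counts directly comparable across datasets.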


    } # end if normalize among each other


    ##################################
    # INTEGRATE GENOTYPE IF PROVIDED #
    ##################################

    if (integrateGenotype) {
        SNPhood.o = associateGenotypes(SNPhood.o, genotypeMapping.df, verbose)
    }

    ############################
    # DETERMINE HETEROZYGOSITY #
    ############################
    SNPhood.o = .determineHeterozygosity(SNPhood.o, verbose = verbose)


    # Set the flag that we now have an object that contains all data; validity checking is now active
    SNPhood.o@internal$disableObjectIntegrityChecking = FALSE
    .checkObjectValidity(SNPhood.o, verbose = verbose)

    #.getMemoryProfile(SNPhood.o, verbose = verbose)

    if (verbose) message("\n\nFINISHED SUCCESSFULLY WITH ALL INPUT FILES.\n")
    .printExecutionTime(start.time, verbose = verbose)

    message("Warnings may have occurred, as indicated. Please check them carefully with unique(warnings()).")

    return(SNPhood.o)

}


#' @import GenomicRanges
# @importFrom GenomicRanges mcols start<- end<-
#' @import checkmate
.getSNPGRangesObj  <-  function(annotation) {

    SNPPos.gr = annotation$regions
    start(SNPPos.gr) = mcols(SNPPos.gr)$SNPPos
    end(SNPPos.gr)   = mcols(SNPPos.gr)$SNPPos
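    # For example, a user region chr1:900-1100 whose metadata column SNPPos is 1000 is collapsed
    # to the single-base range chr1:1000-1000, i.e. the SNP position itself.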

    SNPPos.gr

}

#' @import checkmate
.getFieldsForBAMParsing <- function(type) {

    assertChoice(type, c("regions","SNPs"))

    fieldsToRetrieve = c("strand","pos","qwidth")

    if (type == "SNPs") {
        fieldsToRetrieve = c("strand","pos", "seq")
    }

    fieldsToRetrieve
}


##############################
##############################
# ANALYSIS AND VISUALIZATION #
##############################
##############################

#' Perform allelic bias tests for each user region and bin.
#'
#' \code{testForAllelicBiases} performs tests for allelic biases for each binned user region using binomial tests.
#' For the parameter \code{readGroups}, the name of exactly two read groups must be provided for which allelic ratio tests should be performed.
#' See the Vignette for more details.
#' @template SNPhood
#' @template readGroups
#' @param confLevel Numeric(1). Default 0.95. The confidence level for estimating the confidence intervals. Must be between 0 and 1.
#' @param nullHypothesisFraction Numeric(1). Default 0.5. The expected probability under the null hypothesis of not having any bias. Must be between 0 and 1.
#' @param calcBackgroundDistr Logical(1). Default \code{TRUE}. Should the background distribution be calculated? Note that this is usually very time-consuming.
#' @param nRepetitions Integer(1). Default 100. Number of repetitions for calculating the background distribution.
#' Only relevant if \code{calcBackgroundDistr} is set to \code{TRUE}.
#' @param pValuesToTestBackground Numeric. Default c(0.0001, 0.0005, 0.001, 0.005, seq(0.01,1,0.01)).
#' Set of p-values for which corresponding FDR values will be computed
#' @template verbose_TRUE
#' @return Object of class \code{\linkS4class{SNPhood}} with all the data from the allelic bias test stored in the slot \code{additionalResults},
#' which can be easily retrieved via the accessor function \code{results}.
#' See the help pages of the result function (?results) or the vignette for details.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' ## Perform the test without calculating the background distribution
#' SNPhood.o = testForAllelicBiases (SNPhood.o, readGroups = c("paternal","maternal"))
#' str(results(SNPhood.o, type="allelicBias"), list.len = 8)
#' ## Check the parameters
#' results(SNPhood.o, type="allelicBias", elements = "parameters")
#' @export
#' @import checkmate
# TODO: Get rid of the object copying, but how?
testForAllelicBiases <- function(SNPhood.o,
                                 readGroups,
                                 confLevel = 0.95,
                                 nullHypothesisFraction = 0.5,
                                 calcBackgroundDistr = TRUE,
                                 nRepetitions = 100,
                                 pValuesToTestBackground = c(0.0001, 0.0005, 0.001, 0.005, seq(0.01,1,0.01)),
                                 verbose = TRUE) {

    start.time  <-  Sys.time()

    # Check types and validity of arguments
    assertFlag(verbose)
    .checkObjectValidity(SNPhood.o, verbose = verbose)
    disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
    SNPhood.o@internal$disableObjectIntegrityChecking = TRUE

    if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
        stop(.getErrorForOnlyPrepareSamplesCorrelation())
    }

    assertSubset(readGroups, annotationReadGroups(SNPhood.o))
    assertCharacter(readGroups, len = 2)
    assertNumber(nullHypothesisFraction, lower = 0, upper = 1)
    assertNumber(confLevel, lower = 0, upper = 1)
    assertFlag(calcBackgroundDistr)
    if (calcBackgroundDistr) {
        assertInt(nRepetitions, lower = 1)
        assertNumeric(pValuesToTestBackground, lower = 0, upper = 1, any.missing = FALSE)
    }

    # Only perform background distribution if the number of bins is at least 5.
    # This is a temporary design decision and may be changed in the near future
    if (calcBackgroundDistr & nBins(SNPhood.o) < 5) {
        stop("The number of bins is smaller than 5. Currently, background normalization is only performed when the number of bins is at least 5. Please set calcBackgroundDistr to FALSE and run again. See the vignette for more help.")
    }


    # Get the matrices ready and perform a few tests

    res.l = list()

    res.l$pValue           = list()
    res.l$confIntervalMin  = list()
    res.l$confIntervalMax  = list()
    res.l$fractionEstimate = list()
    res.l$background       = list()
    res.l$FDR_results      = list()


    # Perform binomial tests

    nRegionsCur = nRegions(SNPhood.o)
    nBinsCur    = nBins(SNPhood.o)
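    # Conceptually, .calcBinomTestVector applies a binomial test to each (region, bin) pair of counts from the
    # two selected read groups. A minimal sketch of what happens for a single bin (hypothetical counts):
    # binom.test(x = 30, n = 30 + 10, p = nullHypothesisFraction, conf.level = confLevel)
    # from which the p-value, confidence interval and fraction estimate are extracted.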

    # Bins only needed for the background calculation
    if (calcBackgroundDistr) {
        bins.l = .createBins(annotation(SNPhood.o)$regions,  parameters(SNPhood.o), applyLastBinStrategy = TRUE, .getGenomeData(parameters(SNPhood.o)$assemblyVersion), verbose = verbose)$bins
    }

    for (individualCur in annotationDatasets(SNPhood.o)) {

        if (verbose) message("Test allelic bias for dataset ",individualCur)

        # Init the matrix
        res.l$pValue[[individualCur]] = matrix(NA, nrow = nRegionsCur, ncol = nBinsCur)

        res.l$confIntervalMin[[individualCur]]  = matrix(NA, nrow = nRegionsCur, ncol = nBinsCur)
        res.l$confIntervalMax[[individualCur]]  = matrix(NA, nrow = nRegionsCur, ncol = nBinsCur)

        res.l$confIntervalMin[[individualCur]]  = matrix(NA, nrow = nRegionsCur, ncol = nBinsCur)
        res.l$fractionEstimate[[individualCur]] = matrix(NA, nrow = nRegionsCur, ncol = nBinsCur)

        # Do it for the correct alleles
        indexAlleles = which(annotationReadGroups(SNPhood.o) %in% readGroups)

        stopifnot(length(indexAlleles)  ==  2)


        res.l$pValue[[individualCur]] =
            matrix(.calcBinomTestVector(as.vector(SNPhood.o@readCountsBinned[[indexAlleles[1]]] [[individualCur]]),
                                        as.vector(SNPhood.o@readCountsBinned[[indexAlleles[2]]] [[individualCur]]),
                                        probSuccess = nullHypothesisFraction,
                                        returnType = "p.value",
                                        indexResult = 1,
                                        conf.level = confLevel),
                   nrow = nRegionsCur, ncol = nBinsCur)

        res.l$confIntervalMin[[individualCur]] =
            matrix(.calcBinomTestVector(as.vector(SNPhood.o@readCountsBinned[[indexAlleles[1]]] [[individualCur]]),
                                        as.vector(SNPhood.o@readCountsBinned[[indexAlleles[2]]] [[individualCur]]),
                                        probSuccess = nullHypothesisFraction,
                                        returnType = "conf.int",
                                        indexResult = 1,
                                        conf.level = confLevel),
                   nrow = nRegionsCur, ncol = nBinsCur)


        res.l$confIntervalMax[[individualCur]] =
            matrix(.calcBinomTestVector(as.vector(SNPhood.o@readCountsBinned[[indexAlleles[1]]] [[individualCur]]),
                                        as.vector(SNPhood.o@readCountsBinned[[indexAlleles[2]]] [[individualCur]]),
                                        probSuccess = nullHypothesisFraction,
                                        returnType = "conf.int",
                                        indexResult = 2,
                                        conf.level = confLevel),
                   nrow = nRegionsCur, ncol = nBinsCur)


        res.l$fractionEstimate[[individualCur]] =
            matrix(.calcBinomTestVector(as.vector(SNPhood.o@readCountsBinned[[indexAlleles[1]]] [[individualCur]]),
                                        as.vector(SNPhood.o@readCountsBinned[[indexAlleles[2]]] [[individualCur]]),
                                        probSuccess = nullHypothesisFraction,
                                        returnType = "estimate",
                                        conf.level = confLevel),
                   nrow = nRegionsCur, ncol = nBinsCur)

        if (calcBackgroundDistr) {

            # First calculate the per-bin read overlaps from the stored read start positions and widths. This is time-consuming but only needed here

            if (verbose) message(" Calculating read overlaps for the background distribution... This may take a while")

            readOverlaps.l = list()
            for (x in readGroups) {
                if (verbose) message(" Read group ",x)
                readOverlaps.l[[x]] = .calculateOverlapsReads(
                    SNPhood.o@internal$readStartPos[[x]] [[individualCur]],
                    SNPhood.o@internal$readWidth   [[x]] [[individualCur]],
                    parameters(SNPhood.o),
                    bins.l,
                    SNPhood.o@readCountsUnbinned   [[x]] [[individualCur]],
                    verbose)
            }

            pValuesMinReal = rep(NA, nRegionsCur)
            # Negate the values because there is no min.col(); max.col() on the negated matrix returns the column index of the per-row minimum
            pMinIndex = max.col(-res.l$pValue[[individualCur]])
            pValuesMinReal = sapply(1:nRegionsCur, function(x) {res.l$pValue[[individualCur]][x, pMinIndex[x]]})
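            # Small illustration of the max.col(-x) trick: for m = matrix(c(0.2, 0.05, 0.01, 0.3), nrow = 2),
            # max.col(-m) returns c(2, 1), i.e. the column index of the per-row minimum.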

            # Do the same for the shuffled results

            res.l$background[[individualCur]] = list()

            # Save the minimal p value per region and repetition here. Preallocate space for improved efficiency
            pValuesMinSim = rep(NA, nRepetitions * nRegionsCur)


            if (verbose)
                message(" Calculate background distribution with ",nRepetitions," repetitions. This may take a while.")

            # Calculate the background distribution in parallel. Get shuffled reads for both alleles
            # The argument x is unused; it only allows the function to be called once per repetition by .execInParallelGen
            .callRandomBackgroundDistr <- function(x) {
                .calcRandomBackgroundDistr(nRegionsCur, nBinsCur, readOverlaps.l)
            }


            res.shuffled.l = .execInParallelGen(nCores = parameters(SNPhood.o)$nCores,
                                                returnAsList = TRUE,
                                                listNames = NULL,
                                                iteration = seq_len(nRepetitions),
                                                abortIfErrorParallel = TRUE,
                                                verbose = FALSE,
                                                functionName = .callRandomBackgroundDistr)



            for (repCur in 1:nRepetitions) {

                # pValuesMinSim2[[repCur]] = data.frame(pValueShuffled = numeric(nRegionsCur))

                res.l$background[[individualCur]][[repCur]] = matrix(
                    .calcBinomTestVector(
                        as.vector(res.shuffled.l[[repCur]][[1]]),
                        as.vector(res.shuffled.l[[repCur]][[2]]),
                        returnType = "p.value",
                        indexResult = 1,
                        conf.level = confLevel
                    ),
                    nrow = nRegionsCur, ncol = nBinsCur
                )

                # Determine the smallest p values for each region and each repetition

                # min.col does not exist, so we just invert the values and use max.col
                pMinIndex = max.col(-res.l$background[[individualCur]][[repCur]])

                startIndex = (repCur - 1) * nRegionsCur + 1
                endIndex = startIndex + nRegionsCur - 1
                pValuesMinSim[startIndex:endIndex] = sapply(1:nRegionsCur, function(x) {
                    res.l$background[[individualCur]][[repCur]][x, pMinIndex[x]]
                })


            }

            resManual.df = data.frame(FDR = numeric(),
                                      pValueThreshold = numeric(),
                                      nReal = numeric(),
                                      nSim = numeric())


            for (pThresCur in pValuesToTestBackground) {

                # Average number of significant results
                nSim = length(which(pValuesMinSim <= pThresCur)) / nRepetitions

                # Observed number of real significant results
                nReal = length(which(pValuesMinReal <= pThresCur))

                # FDR is calculated over all user regions
                fdr = nSim / (nReal + nSim)

                if ((nReal + nSim) == 0) {
                    fdr = 0
                }

                resManual.df = rbind(resManual.df, data.frame(pValueThreshold = pThresCur,
                                                              FDR = fdr,
                                                              nReal = nReal,
                                                              nSim = nSim))
            }
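            # Worked example (hypothetical numbers): with nRepetitions = 100, suppose 200 regions have a real
            # minimal p-value <= 0.01 and 500 of the simulated minima (across all repetitions) pass the same
            # threshold. Then nSim = 500 / 100 = 5 and FDR = 5 / (200 + 5) ~ 0.024.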

            res.l$FDR_results[[individualCur]] = resManual.df
        } # end if (calcBackgroundDistr)


    } # end for (individualCur in annotationDatasets(SNPhood.o))



    # Finish up
    readGroupsDiscarded = annotationReadGroups(SNPhood.o)[which(!annotationReadGroups(SNPhood.o) %in% readGroups)]
    res.l$parameters = list(readGroupsTested = readGroups,
                            readGroupsDiscarded = readGroupsDiscarded,
                            confLevel = confLevel,
                            calculateBackgroundDistribution = calcBackgroundDistr,
                            nRepetitionsBackground = nRepetitions,
                            pValuesToTestBackground = pValuesToTestBackground)

    SNPhood.o@additionalResults$allelicBias = res.l

    if (!"allelicBias" %in% SNPhood.o@internal$addResultsElementsAdded) {
        SNPhood.o@internal$addResultsElementsAdded = c(SNPhood.o@internal$addResultsElementsAdded, "allelicBias")

    }

    if (verbose) message("\n\nFINISHED SUCCESSFULLY WITH ALLELIC BIAS TEST.\n")
    .printExecutionTime(start.time, verbose = verbose)

    SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking

    SNPhood.o

}


#' Get the number of SNP regions for a \emph{SNPhood} object.
#'
#' \code{nRegions} is a helper function that returns the number of SNP regions that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @template verbose_FALSE
#' @return Integer. Number of SNP regions that are defined in the \code{\linkS4class{SNPhood}} object
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' nRegions(SNPhood.o)
#' @aliases regions
#' @rdname regions-methods
#' @export
nRegions <- function(SNPhood.o,
                     verbose = FALSE) {

    assertFlag(verbose)
    length(SNPhood.o@annotation$regions)
}

#' Get the number of bins for a \emph{SNPhood} object.
#'
#' Return the number of bins that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @template verbose_FALSE
#' @return Integer. Number of bins that are defined in the \code{\linkS4class{SNPhood}} object
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' nBins(SNPhood.o)
#' @export
#' @aliases bins
#' @rdname bins-methods
nBins <- function(SNPhood.o,
                  verbose = FALSE) {

    assertFlag(verbose)

    if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
        return(0)
    }

    length(SNPhood.o@annotation$bins)
}

#' Get the number of datasets for a \emph{SNPhood} object.
#'
#' Return the number of datasets that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @template verbose_FALSE
#' @return Integer. Number of datasets that are defined in the \code{\linkS4class{SNPhood}} object
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' nDatasets(SNPhood.o)
#' @export
#' @aliases datasets
#' @rdname datasets-methods
nDatasets <- function(SNPhood.o,
                      verbose = FALSE) {

    assertFlag(verbose)
    length(SNPhood.o@annotation$files)
}

#' Get the number of read groups for a \emph{SNPhood} object.
#'
#' Return the number of read groups that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @template verbose_FALSE
#' @return Integer. Number of read groups that are defined in the \code{\linkS4class{SNPhood}} object
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' nReadGroups(SNPhood.o)
#' @export
#' @aliases readGroups
#' @rdname readGroups-methods
nReadGroups <- function(SNPhood.o,
                        verbose = FALSE) {

    assertFlag(verbose)
    length(SNPhood.o@annotation$readGroups)
}

#' Get the annotation of SNP regions for a \emph{SNPhood} object.
#'
#' Return the annotation of the SNP regions that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @param asGRangesObj Logical(1). Default FALSE. Should the full annotation be returned (as \code{GRanges} object) or only the annotation of the SNP regions (as character vector)?
#' @template verbose_FALSE
#' @return If \code{asGRangesObj} is set to \code{TRUE}, a \code{GRanges} object is returned. Otherwise, a character vector with the currently stored SNP annotation is returned.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' IDs.vec = annotationRegions(SNPhood.o, asGRangesObj = FALSE)
#' IDs.gr  = annotationRegions(SNPhood.o, asGRangesObj = TRUE)
#' @export
#' @import checkmate
#' @import GenomicRanges
# @importFrom GenomicRanges mcols
annotationRegions <- function(SNPhood.o,
                              asGRangesObj = FALSE,
                              verbose = FALSE) {

    assertFlag(verbose)
    assertFlag(asGRangesObj)



    if (!asGRangesObj) {
        return(mcols(SNPhood.o@annotation$regions)$annotation)
    } else {
        return(SNPhood.o@annotation$regions)
    }
}


#' Get the annotation(names) of the datasets in a \emph{SNPhood} object.
#'
#' Return the names of the datasets/individuals that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @template verbose_FALSE
#' @return Character vector. Names of the datasets/individuals that are defined in the \code{\linkS4class{SNPhood}} object.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' annotationDatasets(SNPhood.o)
#' @export
annotationDatasets <- function(SNPhood.o,
                               verbose = FALSE) {

    assertFlag(verbose)
    names(SNPhood.o@annotation$files)
}


#' Get the annotation(names) of the read groups in a \emph{SNPhood} object.
#'
#' Return the names of the read groups that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @template verbose_FALSE
#' @return Character vector. Names of the read groups that are defined in the \code{\linkS4class{SNPhood}} object.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' annotationReadGroups(SNPhood.o)
#' @export
annotationReadGroups <- function(SNPhood.o,
                                 verbose = FALSE) {

    assertFlag(verbose)
    SNPhood.o@annotation$readGroups
}

#' Get the annotation(names) of the bins in a \emph{SNPhood} object.
#'
#' Return the names of the bins that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @template verbose_FALSE
#' @return Character vector. Names of the bins that are defined in the \code{\linkS4class{SNPhood}} object.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' annotationBins(SNPhood.o)
#' @export

annotationBins <- function(SNPhood.o,
                           verbose = FALSE) {

    assertFlag(verbose)
    SNPhood.o@annotation$bins
}



#' Merges the counts of all read groups for a \emph{SNPhood} object
#'
#' \code{mergeReadGroups} merges the counts of all read groups for a \code{\linkS4class{SNPhood}} object. This function can only be executed if more than one read group is defined in the object and
#' if read counts have not been converted into allelic fractions. Also carefully note the warning below.
#' @template SNPhood
#' @param summaryFunction Character(1). Default "sum". Either "sum" or "mean". How should the read counts from different read groups be summarized?
#' If set to "sum", all counts are summed up, which yields values identical to running the main analysis non-allele-specifically.
#' If set to "mean", the mean value across all read groups is calculated.
#' @template verbose_TRUE
#' @return A modified \code{\linkS4class{SNPhood}} object with only one read group "allReadGroups", with all occurrences of
#' the original read groups replaced by "allReadGroups". For object consistency, as mentioned in the warning below,
#' some results from analyses depending on read groups are removed completely.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' nReadGroups(SNPhood.o)
#' SNPhood_merged.o = mergeReadGroups(SNPhood.o)
#' nReadGroups(SNPhood_merged.o)
#' @section Warning:
#' \strong{Merging read groups is irreversible. This transformation cannot be undone.}
#' It might therefore be advisable to save the resulting object in a new variable as shown in the examples.
#'
#' \strong{Results from the allelic bias test and clustering results will also be removed to keep the object consistent.}
#' @export
#' @import checkmate
mergeReadGroups  <-  function(SNPhood.o,
                              summaryFunction = "sum",
                              verbose = TRUE) {

    # TODO: merge only a particular set of read groups

    # Check types and validity of arguments
    assertFlag(verbose)
    .checkObjectValidity(SNPhood.o, verbose = verbose)
    disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
    SNPhood.o@internal$disableObjectIntegrityChecking = TRUE

    if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
        stop(.getErrorForOnlyPrepareSamplesCorrelation())
    }

    assertChoice(summaryFunction, c("mean", "sum"), .var.name = "summaryFunction")

    if (SNPhood.o@internal$isAllelicRatio) {
        warning("Cannot merge read groups because read counts have been converted to allelic fractions. Object will remain unchanged.")
        return(SNPhood.o)
    }

    uniqueReadGroups = annotationReadGroups(SNPhood.o)
    if (length(uniqueReadGroups)  ==  1) {
        warning("Cannot summarize read groups because only one read group was found: ", uniqueReadGroups, ". Object will remain unchanged.")
        return(SNPhood.o)
    }

    nColumns  = nBins(SNPhood.o)
    nRows     = nRegions(SNPhood.o)


    for (i in 1:nDatasets(SNPhood.o)) {

        # Need to store multiple matrices in memory to properly summarize them later
        matrices.l  = list()
        matrices2.l = list()
        for (j in seq_len(length(uniqueReadGroups))) {
            matrices.l[[j]]  =  SNPhood.o@readCountsBinned  [[uniqueReadGroups[j]]] [[i]]
            matrices2.l[[j]] =  SNPhood.o@readCountsUnbinned[[uniqueReadGroups[j]]] [[i]]
        }

        # Now summarize across read groups. Element-wise summation via Reduce works both for the binned
        # count matrices and for the unbinned count vectors; for "mean", the sum is divided by the number
        # of read groups.

        if (summaryFunction  ==  "mean") {

            SNPhood.o@readCountsBinned  [["allReadGroups"]] [[i]] = Reduce('+', matrices.l)  / length(matrices.l)
            SNPhood.o@readCountsUnbinned[["allReadGroups"]] [[i]] = Reduce('+', matrices2.l) / length(matrices2.l)

        } else if (summaryFunction  ==  "sum") {

            SNPhood.o@readCountsBinned  [["allReadGroups"]] [[i]] = Reduce('+', matrices.l)
            SNPhood.o@readCountsUnbinned[["allReadGroups"]] [[i]] = Reduce('+', matrices2.l)
        }
    }
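    # Minimal sketch of the element-wise summarization used above (hypothetical 2x2 matrices):
    # Reduce('+', list(matrix(1:4, 2), matrix(5:8, 2))) yields matrix(c(6, 8, 10, 12), 2),
    # and dividing by the number of read groups gives the corresponding mean.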

    names(SNPhood.o@readCountsBinned  [["allReadGroups"]]) = names(SNPhood.o@readCountsBinned[[1]])
    names(SNPhood.o@readCountsUnbinned[["allReadGroups"]]) = names(SNPhood.o@readCountsUnbinned[[1]])

    # Currently, the enrichment slot will be empty because enrichment is only possible if done non-allele-specifically

    # Delete old read groups
    for (readGroupCur in uniqueReadGroups) {
        SNPhood.o@readCountsBinned[[readGroupCur]] = NULL
        SNPhood.o@readCountsUnbinned[[readGroupCur]] = NULL
        SNPhood.o@enrichmentBinned[[readGroupCur]] = NULL
        SNPhood.o@internal$sizeFactors[[readGroupCur]] = NULL

        # Delete clustering results also
        if (!testNull(SNPhood.o@additionalResults$clustering$enrichmentBinned[[readGroupCur]])) {
            SNPhood.o@additionalResults$clustering[[readGroupCur]] = NULL
        }
    }

    # Reset, not meaningful to merge the old values anyway
    SNPhood.o@enrichmentBinned[["allReadGroups"]] = list()
    SNPhood.o@internal$sizeFactors[["allReadGroups"]] = list()

    # Read start pos and width
    SNPhood.o@internal$readStartPos[["allReadGroups"]] = list()
    SNPhood.o@internal$readWidth   [["allReadGroups"]] = list()

    for (datasetCur in annotationDatasets(SNPhood.o)) {

        SNPhood.o@internal$readStartPos[["allReadGroups"]][[datasetCur]] = vector("list", nRegions(SNPhood.o))
        SNPhood.o@internal$readWidth   [["allReadGroups"]][[datasetCur]] = vector("list", nRegions(SNPhood.o))
        for (readGroupCur in uniqueReadGroups) {

            for (regionCur in seq_len(nRegions(SNPhood.o))) {
                SNPhood.o@internal$readStartPos[["allReadGroups"]][[datasetCur]][[regionCur]] =
                    c(SNPhood.o@internal$readStartPos[["allReadGroups"]][[datasetCur]][[regionCur]], SNPhood.o@internal$readStartPos[[readGroupCur]][[datasetCur]][[regionCur]])

                SNPhood.o@internal$readWidth[["allReadGroups"]][[datasetCur]][[regionCur]] =
                    c(SNPhood.o@internal$readWidth[["allReadGroups"]][[datasetCur]][[regionCur]], SNPhood.o@internal$readWidth[[readGroupCur]][[datasetCur]][[regionCur]])

            }

        }

    }

    # Delete old references
    for (j in uniqueReadGroups) {
        SNPhood.o@internal$readStartPos[[j]] = NULL
        SNPhood.o@internal$readWidth[[j]]    = NULL
    }

    SNPhood.o@annotation$genotype$readsDerived[["allReadGroups"]] = list()
    for (i in annotationDatasets(SNPhood.o)) {

        isInitiated = FALSE
        for (j in uniqueReadGroups) {

            if (!isInitiated) { # Init
                SNPhood.o@annotation$genotype$readsDerived[["allReadGroups"]][[i]] = SNPhood.o@annotation$genotype$readsDerived[[j]][[i]]
                isInitiated = TRUE
            } else {
                SNPhood.o@annotation$genotype$readsDerived[["allReadGroups"]][[i]] = SNPhood.o@annotation$genotype$readsDerived[["allReadGroups"]][[i]] +
                    SNPhood.o@annotation$genotype$readsDerived[[j]][[i]]
            }
        }
    }


    #names(SNPhood.o@annotation$genotype$readsDerived[["allReadGroups"]])[i] = annotationDatasets(SNPhood.o)[i]
    for (j in uniqueReadGroups) {
        SNPhood.o@annotation$genotype$readsDerived[[j]] = NULL
    }

    # Remove the results from the allelicBias analysis
    SNPhood.o@additionalResults$allelicBias = NULL

    # Remove the clustering results completely if nothing is left
    if (testNull(SNPhood.o@additionalResults$clustering$enrichmentBinned)) {
        SNPhood.o@additionalResults$clustering = NULL
    }

    # Make object consistent
    SNPhood.o@annotation$readGroups = "allReadGroups"
    SNPhood.o@internal$mergedReadGroups = TRUE

    SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking

    SNPhood.o
}

#' Delete a particular set of read groups.
#'
#' \code{deleteReadGroups} deletes a particular set of read groups from a \emph{SNPhood} object.
#' Removal is irreversible. It is therefore recommended to save the resulting \code{\link{SNPhood}} object with a new name.
#'
#' @template SNPhood
#' @template readGroups
#' @template verbose_TRUE
#' @return an object of class \code{\link{SNPhood}} with the specified read groups removed from all slots. Otherwise identical to the input \code{\link{SNPhood}} object.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' SNPhood_mod.o = deleteReadGroups(SNPhood.o, "ambiguous")
#' @seealso  \code{\link{deleteDatasets}}, \code{\link{deleteRegions}}
#' @export
#' @import checkmate
deleteReadGroups   <-  function(SNPhood.o,
                                readGroups = NULL,
                                verbose = TRUE) {

    # Check types and validity of arguments
    assertFlag(verbose)
    .checkObjectValidity(SNPhood.o, verbose = verbose)
    disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
    SNPhood.o@internal$disableObjectIntegrityChecking = TRUE

    if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
        stop(.getErrorForOnlyPrepareSamplesCorrelation())
    }

    assertSubset(readGroups, SNPhood.o@annotation$readGroups)
    assertCharacter(readGroups, any.missing = FALSE, min.chars = 1, min.len = 0, max.len = nReadGroups(SNPhood.o) - 1, unique = TRUE)


    if (nReadGroups(SNPhood.o)  ==  1) {
        warning("Cannot delete any read groups because only one read group has been found (", annotationReadGroups(SNPhood.o),"). Returning an unmodified object.")
        return(SNPhood.o)
    }

    maxLen = nReadGroups(SNPhood.o) - 1
    if (!testCharacter(readGroups, any.missing = FALSE, min.len = 1, min.chars = 1, max.len = maxLen)) {
        warning("Between one and ", maxLen," read groups must be specified. Returning an unmodified object.")
        return(SNPhood.o)
    }

    stopifnot(nReadGroups(SNPhood.o) > length(readGroups))

    for (readGroupCur in readGroups) {
        SNPhood.o@readCountsUnbinned   [[readGroupCur]] = NULL
        SNPhood.o@readCountsBinned     [[readGroupCur]] = NULL
        SNPhood.o@enrichmentBinned     [[readGroupCur]] = NULL
        SNPhood.o@internal$sizeFactors [[readGroupCur]] = NULL
        SNPhood.o@internal$readStartPos [[readGroupCur]] = NULL
        SNPhood.o@internal$readWidth    [[readGroupCur]] = NULL
    }

    SNPhood.o@annotation$readGroups = annotationReadGroups(SNPhood.o)[-which(annotationReadGroups(SNPhood.o) %in% readGroups)]

    SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking

    assertInt(nReadGroups(SNPhood.o), lower = 1)

    SNPhood.o
}

# newReadGroupsMapping = list("a", "b", "c")
# names(newReadGroupsMapping) = annotationReadGroups(SNPhood.o)


#' Rename read groups.
#'
#' \code{renameReadGroups} renames a set of read groups from a \emph{SNPhood} object.
#'
#' @template SNPhood
#' @param newReadGroupsMapping Named list. For clarity of mapping, the names of the list must be the currently defined read group names,
#' and the values of each element the corresponding new ones.
#' @template verbose_TRUE
#' @return an object of class \code{\link{SNPhood}} with the requested read groups being renamed.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' mapping = list("a", "b", "c")
#' names(mapping) = annotationReadGroups(SNPhood.o)
#' SNPhood_mod.o = renameReadGroups (SNPhood.o, mapping)
#' @seealso \code{\link{renameBins}}, \code{\link{renameDatasets}}, \code{\link{renameRegions}}
#' @import checkmate
#' @export
renameReadGroups  <-  function(SNPhood.o,
                               newReadGroupsMapping,
                               verbose = TRUE) {

    assertFlag(verbose)
    .checkObjectValidity(SNPhood.o, verbose = verbose)
    disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
    SNPhood.o@internal$disableObjectIntegrityChecking = TRUE

    if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
        stop(.getErrorForOnlyPrepareSamplesCorrelation())
    }

    assertList(newReadGroupsMapping, any.missing = FALSE, min.len = 1, max.len = nReadGroups(SNPhood.o))
    assertSubset(names(newReadGroupsMapping), annotationReadGroups(SNPhood.o))
    assertCharacter(unlist(newReadGroupsMapping), any.missing = FALSE, min.chars = 1, unique = TRUE)

    for (i in seq_len(length(newReadGroupsMapping))) {

        oldR = names(newReadGroupsMapping)[i]
        newR = newReadGroupsMapping[[i]]

        # Check for NULL first; comparing against a NULL value would fail otherwise
        if (testNull(newR)) {
            stop("New name cannot be NULL for read group ", names(newReadGroupsMapping)[i])
        }

        if (oldR == newR) next

        # If the new name is already present in the object, make sure that the renaming does not cause issues
        if (newR %in% annotationReadGroups(SNPhood.o)) {
            stop("Cannot rename read group ",oldR, " to ", newR, " because the new name is already present in the object.")
        }


        names(SNPhood.o@readCountsUnbinned)[which(names(SNPhood.o@readCountsUnbinned)  ==  oldR)]     = newR

        names(SNPhood.o@readCountsBinned)[which(names(SNPhood.o@readCountsBinned)  ==  oldR)]         = newR

        names(SNPhood.o@enrichmentBinned)[which(names(SNPhood.o@enrichmentBinned)  ==  oldR)]         = newR

        names(SNPhood.o@internal$sizeFactors) [which(names(SNPhood.o@internal$sizeFactors)  ==  oldR)] = newR
        names(SNPhood.o@internal$readStartPos)[which(names(SNPhood.o@internal$readStartPos) ==  oldR)] = newR
        names(SNPhood.o@internal$readWidth)   [which(names(SNPhood.o@internal$readWidth)    ==  oldR)] = newR

        names(SNPhood.o@annotation$genotype$readsDerived)[which(names(SNPhood.o@annotation$genotype$readsDerived)  ==  oldR)] = newR

        SNPhood.o@annotation$readGroups[which(SNPhood.o@annotation$readGroups  ==  oldR)] = newR

        SNPhood.o@additionalResults$allelicBias$parameters$readGroupsTested   [which(SNPhood.o@additionalResults$allelicBias$parameters$readGroupsTested    == oldR)] = newR
        SNPhood.o@additionalResults$allelicBias$parameters$readGroupsDiscarded[which(SNPhood.o@additionalResults$allelicBias$parameters$readGroupsDiscarded == oldR)] = newR

    }




    SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking

    SNPhood.o
}


#' Rename datasets.
#'
#' \code{renameDatasets} renames datasets from a \emph{SNPhood} object.
#'
#' @template SNPhood
#' @param newDatasetsMapping Named list. For clarity of mapping, the names of the list must be the currently defined dataset names,
#' and the values of each element the corresponding new ones.
#' @template verbose_TRUE
#' @return an object of class \code{\link{SNPhood}} with the requested datasets being renamed.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' mapping = list("Individual1", "Individual2")
#' names(mapping) = annotationDatasets(SNPhood.o)
#' SNPhood_mod.o = renameDatasets(SNPhood.o, mapping)
#' @seealso \code{\link{renameBins}}, \code{\link{renameReadGroups}}, \code{\link{renameRegions}}
#' @export
#' @import checkmate
renameDatasets  <-  function(SNPhood.o,
                             newDatasetsMapping,
                             verbose = TRUE) {

    assertFlag(verbose)
    .checkObjectValidity(SNPhood.o, verbose = verbose)
    disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
    SNPhood.o@internal$disableObjectIntegrityChecking = TRUE

    assertList(newDatasetsMapping, any.missing = FALSE, min.len = 1, max.len = nDatasets(SNPhood.o))
    assertSubset(names(newDatasetsMapping), annotationDatasets(SNPhood.o))
    assertCharacter(as.character(newDatasetsMapping), any.missing = FALSE, min.chars = 1, unique = TRUE)

    for (i in seq_len(length(newDatasetsMapping))) {

        oldR = names(newDatasetsMapping)[i]
        newR = newDatasetsMapping[[i]]

        # Check for NULL first; comparing against a NULL value would fail otherwise
        if (testNull(newR)) {
            stop("New name cannot be NULL for dataset ", names(newDatasetsMapping)[i])
        }

        if (oldR == newR) next

        # If the new name is already present in the object, make sure that the renaming does not cause issues
        if (newR %in% annotationDatasets(SNPhood.o)) {
            stop("Cannot rename dataset ",oldR, " to ", newR, " because the new name is already present in the object.")
        }

        # SLOT ANNOTATION
        names(SNPhood.o@annotation$files)[which(names(SNPhood.o@annotation$files)  ==  oldR)] = newR


        for (alleleCur in annotationReadGroups(SNPhood.o)) {

            names(SNPhood.o@annotation$genotype$readsDerived[[alleleCur]])[which(names(SNPhood.o@annotation$genotype$readsDerived[[alleleCur]]) ==  oldR)] = newR
            colnames(SNPhood.o@annotation$genotype$heterozygosity)[which(colnames(SNPhood.o@annotation$genotype$heterozygosity) == oldR)] = newR

            names(SNPhood.o@readCountsUnbinned[[alleleCur]])[which(names(SNPhood.o@readCountsUnbinned[[alleleCur]])       ==  oldR)] = newR
            names(SNPhood.o@readCountsBinned[[alleleCur]])[which(names(SNPhood.o@readCountsBinned[[alleleCur]])           ==  oldR)] = newR
            names(SNPhood.o@enrichmentBinned[[alleleCur]])[which(names(SNPhood.o@enrichmentBinned[[alleleCur]])           ==  oldR)] = newR
            names(SNPhood.o@internal$sizeFactors[[alleleCur]])[which(names(SNPhood.o@internal$sizeFactors[[alleleCur]])   ==  oldR)] = newR
            names(SNPhood.o@internal$readStartPos[[alleleCur]])[which(names(SNPhood.o@internal$readStartPos[[alleleCur]]) ==  oldR)] = newR
            names(SNPhood.o@internal$readWidth[[alleleCur]])[which(names(SNPhood.o@internal$readWidth[[alleleCur]])       ==  oldR)] = newR

        }

        if (!testNull(SNPhood.o@additionalResults$allelicBias)) {
            names(SNPhood.o@additionalResults$allelicBias$pValue)[which(names(SNPhood.o@additionalResults$allelicBias$pValue) ==  oldR)] = newR
            names(SNPhood.o@additionalResults$allelicBias$confIntervalMin)[which(names(SNPhood.o@additionalResults$allelicBias$confIntervalMin) ==  oldR)] = newR
            names(SNPhood.o@additionalResults$allelicBias$confIntervalMax)[which(names(SNPhood.o@additionalResults$allelicBias$confIntervalMax) ==  oldR)] = newR
            names(SNPhood.o@additionalResults$allelicBias$fractionEstimate)[which(names(SNPhood.o@additionalResults$allelicBias$fractionEstimate) ==  oldR)] = newR
            names(SNPhood.o@additionalResults$allelicBias$background)[which(names(SNPhood.o@additionalResults$allelicBias$background) ==  oldR)] = newR
            #TODO: dataset names
            # TODO: change implementation: just have the names appear once and use [[1]] and [[2]] or so for it throughout the object
        }


    }

    SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking

    SNPhood.o
}


#' Rename bins.
#'
#' \code{renameBins} renames bins from a \emph{SNPhood} object.
#'
#' @template SNPhood
#' @param newBinsMapping Named list. For clarity of mapping, the names of the list must be the currently defined bin names,
#' and the values of each element the corresponding new ones.
#' @template verbose_TRUE
#' @return an object of class \code{\link{SNPhood}} with the requested bins being renamed.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' mapping = list("Bin1_NEW")
#' names(mapping) = annotationBins(SNPhood.o)[1]
#' SNPhood_mod.o = renameBins(SNPhood.o, mapping)
#' @seealso \code{\link{renameDatasets}}, \code{\link{renameReadGroups}}, \code{\link{renameRegions}}
#' @export
#' @import checkmate
renameBins  <-  function(SNPhood.o,
                         newBinsMapping,
                         verbose = TRUE) {

    .checkObjectValidity(SNPhood.o, verbose = verbose)
    disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
    SNPhood.o@internal$disableObjectIntegrityChecking = TRUE

    if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
        stop(.getErrorForOnlyPrepareSamplesCorrelation())
    }

    assertList(newBinsMapping, any.missing = FALSE, min.len = 1, max.len = nBins(SNPhood.o))
    assertSubset(names(newBinsMapping), annotationBins(SNPhood.o))
    assertCharacter(unlist(newBinsMapping), any.missing = FALSE, min.chars = 1, unique = TRUE)

    for (i in seq_len(length(newBinsMapping))) {

        oldR = names(newBinsMapping)[i]
        newR = newBinsMapping[[i]]

        # Check for NULL first; comparing against a NULL value would fail otherwise
        if (testNull(newR)) {
            stop("New name cannot be NULL for bin ", names(newBinsMapping)[i])
        }

        if (oldR == newR) next

        # If the new name is already present in the object, make sure that the renaming does not cause issues
        if (newR %in% annotationBins(SNPhood.o)) {
            stop("Cannot rename bin ",oldR, " to ", newR, " because the new name is already present in the object.")
        }

        SNPhood.o@annotation$bins[which(SNPhood.o@annotation$bins  ==  oldR)] = newR
    }

    SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking

    SNPhood.o
}



#' Rename regions.
#'
#' \code{renameRegions} renames regions from a \emph{SNPhood} object.
#'
#' @template SNPhood
#' @param newRegionsMapping Named list. For clarity of mapping, the names of the list must be the currently defined region names,
#' and the values of each element the corresponding new ones.
#' @template verbose_TRUE
#' @return An object of class \code{\link{SNPhood}} with the requested regions being renamed
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' mapping = as.list(paste0(annotationRegions(SNPhood.o),".newName"))
#' names(mapping) = annotationRegions(SNPhood.o)
#' SNPhood_mod.o = renameRegions(SNPhood.o, mapping)
#' @seealso \code{\link{renameBins}}, \code{\link{renameDatasets}}, \code{\link{renameReadGroups}}
#' @export
#' @import checkmate
#' @import GenomicRanges
# @importFrom GenomicRanges mcols<-
renameRegions <-  function(SNPhood.o,
                           newRegionsMapping,
                           verbose = TRUE) {

    .checkObjectValidity(SNPhood.o, verbose = verbose)
    disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
    SNPhood.o@internal$disableObjectIntegrityChecking = TRUE

    assertList(newRegionsMapping, any.missing = FALSE, min.len = 1, max.len = nRegions(SNPhood.o))
    assertSubset(names(newRegionsMapping), annotationRegions(SNPhood.o))
    assertCharacter(unlist(newRegionsMapping), any.missing = FALSE, min.chars = 1, unique = TRUE)

    for (i in seq_len(length(newRegionsMapping))) {

        oldR = names(newRegionsMapping)[i]
        newR = newRegionsMapping[[i]]

        # Check for NULL first; comparing against a NULL value would fail otherwise
        if (testNull(newR)) {
            stop("New name cannot be NULL for region ", names(newRegionsMapping)[i])
        }

        if (oldR == newR) next

        # If the new name is already present in the object, make sure that the renaming does not cause issues
        if (newR %in% annotationRegions(SNPhood.o)) {
            stop("Cannot rename region ",oldR, " to ", newR, " because the new name is already present in the object.")
        }

        mcols(SNPhood.o@annotation$regions)$annotation[which(names(SNPhood.o@annotation$regions) == oldR)] = newR
        names(SNPhood.o@annotation$regions) [which(names(SNPhood.o@annotation$regions) == oldR)] = newR
    }

    SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking

    SNPhood.o
}


#' Delete a set of user regions from a \emph{SNPhood} object.
#'
#' \code{deleteRegions} deletes a set of user regions.
#' Removal is irreversible. It is therefore recommended to save the resulting \code{\link{SNPhood}} object with a new name.
#'
#'
#' @template SNPhood
#' @template regions
#' @template verbose_TRUE
#' @return an object of class \code{\link{SNPhood}} with the requested regions being deleted.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' # Delete the first 10 regions
#' SNPhood_mod.o = deleteRegions(SNPhood.o, c(1:10))
#'
#' # Delete regions by their annotation
#' SNPhood_mod.o = deleteRegions(SNPhood.o, c("rs2822405", "rs467140"))
#' @section Warning:
#' \strong{Execution of this function resets the slot additionalResults and all of its results (e.g., allelic bias analysis).
#' The reason for this is that all results stored in this slot are affected by the deletion of regions.}
#' @seealso \code{\link{deleteDatasets}}, \code{\link{deleteReadGroups}}
#' @export
#' @import checkmate
deleteRegions  <-  function(SNPhood.o,
                            regions,
                            verbose = TRUE) {

    # Check types and validity of arguments
    assertFlag(verbose)
    .checkObjectValidity(SNPhood.o, verbose = verbose)
    disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
    SNPhood.o@internal$disableObjectIntegrityChecking = TRUE

    if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
        stop(.getErrorForOnlyPrepareSamplesCorrelation())
    }

    regions = .checkAndConvertRegionArgument(SNPhood.o, regions, nullAllowed = FALSE, maxLength = nRegions(SNPhood.o) - 1)

    # SLOT annotation
    SNPhood.o@annotation$regions = SNPhood.o@annotation$regions[-regions,]

    # SLOT config: Nothing to do

    # SLOT readCountsUnbinned, readCountsBinned, enrichmentBinned
    for (alleleCur in annotationReadGroups(SNPhood.o)) {

        for (fileCur in annotationDatasets(SNPhood.o)) {
            SNPhood.o@readCountsUnbinned[[alleleCur]] [[fileCur]] = SNPhood.o@readCountsUnbinned[[alleleCur]] [[fileCur]][-regions]
            SNPhood.o@readCountsBinned[[alleleCur]] [[fileCur]] = SNPhood.o@readCountsBinned[[alleleCur]] [[fileCur]][-regions,]
            if (parameters(SNPhood.o)$normByInput) {
                SNPhood.o@enrichmentBinned[[alleleCur]] [[fileCur]] = SNPhood.o@enrichmentBinned[[alleleCur]] [[fileCur]][-regions,]
            }

            # SLOT internal
            if (!testNull(SNPhood.o@internal$readStartPos[[alleleCur]] [[fileCur]])) {
                SNPhood.o@internal$readStartPos[[alleleCur]] [[fileCur]][regions] = NULL
                SNPhood.o@internal$readWidth[[alleleCur]] [[fileCur]][regions] = NULL
            }

        }
    }


    # SLOT additionalResults

    # Reset that, because all slots are affected by the deletion of regions
    SNPhood.o@additionalResults = list()


    SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking

    SNPhood.o
}
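# Illustrative sketch (comment only, not executed): because deleteRegions resets the slot
# additionalResults, analyses such as the allelic bias test have to be recomputed afterwards.
# A minimal sequence, assuming the example data shipped with the package:
#
#   data(SNPhood.o, package = "SNPhood")
#   SNPhood_mod.o = deleteRegions(SNPhood.o, c(1:10))
#   length(SNPhood_mod.o@additionalResults)   # 0, all additional results were reset
#   nRegions(SNPhood_mod.o)                   # original number of regions minus 10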

#' Delete a particular set of datasets from a \emph{SNPhood} object.
#'
#' \code{deleteDatasets} deletes a particular set of datasets from a \code{\link{SNPhood}} object.
#' Removal is irreversible. It is therefore recommended to save the resulting \code{\link{SNPhood}} object with a new name because the deleted datasets cannot be recovered.
#'
#' @template SNPhood
#' @template datasets
#' @template verbose_TRUE
#' @return an object of class \code{\link{SNPhood}} with the requested datasets removed from all slots.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' SNPhood_mod.o = deleteDatasets(SNPhood.o, c(1,2))
#' @seealso \code{\link{deleteRegions}}, \code{\link{deleteReadGroups}}
#' @export
#' @import checkmate
deleteDatasets  <-  function(SNPhood.o,
                             datasets = NULL,
                             verbose = TRUE) {

    # Check types and validity of arguments
    assertFlag(verbose)
    .checkObjectValidity(SNPhood.o, verbose = verbose)
    disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
    SNPhood.o@internal$disableObjectIntegrityChecking = TRUE

    if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
        stop(.getErrorForOnlyPrepareSamplesCorrelation())
    }

    assert(checkNull(datasets),
           checkIntegerish(datasets, lower = 1, upper = nDatasets(SNPhood.o), unique = TRUE, min.len = 1, max.len = nDatasets(SNPhood.o) - 1),
           checkCharacter(datasets, min.len = 1, max.len = nDatasets(SNPhood.o) - 1, unique = TRUE)
    )


    if (testCharacter(datasets)) {
        assertSubset(datasets, annotationDatasets(SNPhood.o))
    }

    if (testNull(datasets)) {
        warning("No datasets have been provided for deletion. Object will remain unchanged.")
        return(SNPhood.o)
    }

    # Convert integer indices to dataset names
    if (testIntegerish(datasets, lower = 1, upper = nDatasets(SNPhood.o), unique = TRUE, min.len = 1, max.len = nDatasets(SNPhood.o) - 1)) {
        datasets = annotationDatasets(SNPhood.o)[datasets]
    }


    # SLOT annotation
    SNPhood.o@annotation$files =   annotation(SNPhood.o)$files[-which(names(annotation(SNPhood.o)$files) %in% datasets)]

    # Slot config: Nothing to do

    # SLOT enrichmentBinned, readCountsUnbinned, and readCountsBinned, additionalResults
    for (alleleCur in annotationReadGroups(SNPhood.o)) {

        for (fileCur in datasets) {

            if (parameters(SNPhood.o)$normByInput) {
                SNPhood.o@enrichmentBinned[[alleleCur]] [[fileCur]] = NULL
            }

            SNPhood.o@annotation$genotype$readsDerived[[alleleCur]][[fileCur]] = NULL

            SNPhood.o@readCountsUnbinned[[alleleCur]] [[fileCur]] = NULL
            SNPhood.o@readCountsBinned  [[alleleCur]] [[fileCur]] = NULL

            if (!testNull(SNPhood.o@additionalResults$allelicBias)) {
                SNPhood.o@additionalResults$allelicBias$pValue[fileCur] = NULL
                SNPhood.o@additionalResults$allelicBias$confIntervalMin[fileCur] = NULL
                SNPhood.o@additionalResults$allelicBias$confIntervalMax[fileCur] = NULL
                SNPhood.o@additionalResults$allelicBias$fractionEstimate[fileCur] = NULL
                SNPhood.o@additionalResults$allelicBias$background[fileCur] = NULL
            }


            # Delete clustering results
            if (!testNull(SNPhood.o@additionalResults$clustering$enrichmentBinned[[alleleCur]][[fileCur]])) {
                SNPhood.o@additionalResults$clustering[[alleleCur]][[fileCur]] = NULL
            }


            # SLOT INTERNAL
            if (!testNull(SNPhood.o@internal$readStartPos[[alleleCur]][[fileCur]])) {
                SNPhood.o@internal$readStartPos[[alleleCur]][[fileCur]] = NULL
                SNPhood.o@internal$readWidth[[alleleCur]][[fileCur]] = NULL
            }

        }
    }


    SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking


    SNPhood.o

}
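# Illustrative sketch (comment only, not executed): datasets can be referenced either by
# their index or by their name as returned by annotationDatasets(), analogous to the
# documented example above.
#
#   data(SNPhood.o, package = "SNPhood")
#   SNPhood_mod.o = deleteDatasets(SNPhood.o, annotationDatasets(SNPhood.o)[1])
#   nDatasets(SNPhood_mod.o)   # one less than in the original object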


#' Get the annotation (names) of bins in a \emph{SNPhood} object.
#'
#' \code{annotationBins2} is a helper function that returns the annotation of the bins that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @param regions Integer or character. Default NULL. A subset of the SNP regions for which annotation is needed. Either the row numbers or the rownames (IDs) of the SNP regions are supported.
#' @param fullAnnotation Logical(1). Should the full annotation (as a data.frame) be returned or only the annotation of the bins (as a character vector)?
#' @template verbose_TRUE
#' @return If \code{fullAnnotation} is set to \code{TRUE}, a data.frame with the full annotation of the bins for the (subset of) SNP regions is returned. Otherwise, a character vector with only the annotation of the bins is returned.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' annotation.df  = annotationBins2(SNPhood.o, regions = 1:10, fullAnnotation = TRUE)
#' annotation.vec = annotationBins2(SNPhood.o, regions = 1:10, fullAnnotation = FALSE)
#' @section Warning:
#' \strong{The number of returned bins can easily be very large, on the order of millions.
#' Be careful because the memory consumption due to the resulting object may increase considerably.}
#' Reduce memory requirements by returning only a subset of SNP regions.
#' @export
#' @import checkmate
#' @import GenomicRanges
# @importFrom GenomicRanges GRanges mcols
annotationBins2 <- function(SNPhood.o,
                            regions = NULL,
                            fullAnnotation = FALSE,
                            verbose = TRUE) {

    # Check types and validity of arguments
    assertFlag(verbose)
    .checkObjectValidity(SNPhood.o, verbose = verbose)
    disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
    SNPhood.o@internal$disableObjectIntegrityChecking = TRUE

    if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
        stop(.getErrorForOnlyPrepareSamplesCorrelation())
    }

    regions = .checkAndConvertRegionArgument(SNPhood.o, regions, nullAllowed = TRUE, maxLength = NULL)

    assertFlag(fullAnnotation)
    assertFlag(verbose)

    # Grep the rows that the user wants
    if (!testNull(regions)) {
        userRegions.red.gr = SNPhood.o@annotation$regions[regions]
    } else {
        userRegions.red.gr = SNPhood.o@annotation$regions
    }

    res.l = .createBins(userRegions.red.gr, parameters(SNPhood.o), applyLastBinStrategy = TRUE, .getGenomeData(parameters(SNPhood.o)$assemblyVersion), verbose = verbose)

    binsPerRegion.vec = table(res.l$bins$origRegionIndex)

    nBinsPerRegion = unique(binsPerRegion.vec)


    stopifnot( length(nBinsPerRegion) == 1)

    SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking

    # Create a general file format out of the list and call transformation helper function afterwards

    if (fullAnnotation) {
        bins.df = data.frame(chr = res.l$bins$chr,
                             start = res.l$bins$start,
                             end = res.l$bins$end,
                             annotationBin = paste0(mcols(userRegions.red.gr)$annotation[res.l$bins$origRegionIndex],"_bin", rep(1:nBinsPerRegion, length(userRegions.red.gr))),
                             annotationRegion = res.l$bins$origRegionIndex,
                             length = res.l$bins$length,
                             strand = res.l$bins$strand,
                             stringsAsFactors = TRUE
        )
        #
        #
        #       if (!is.null(addScoreColumn)) {
        #
        #           #Create the subset of rows that contain our regions
        #           includeScoreColumn.red.m = addScoreColumn [which(rownames(addScoreColumn) %in% mcols(userRegions.red.gr)$annotation[unique(res.l$bins$origRegionIndex)]),]
        #
        #           assertInt(nrow(includeScoreColumn.red.m), lower=1)
        #
        #           # Put the elements of the matrix into the correct order and into the bins.df dataframe(score column)
        #
        #           #Check if the order is indeed identical
        #           identical(rownames(includeScoreColumn.red.m), mcols(userRegions.red.gr)$annotation)
        #
        #           bins.df$score = as.vector(t(includeScoreColumn.red.m))
        #
        #       }
        return(bins.df)

    } else {

        return(paste0(mcols(userRegions.red.gr)$annotation[res.l$bins$origRegionIndex],"_bin", rep(1:nBinsPerRegion, length(userRegions.red.gr))))
    }


}
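# Note on the returned bin labels: they are constructed by appending "_bin<i>" to the
# annotation of the originating region (see the paste0 calls above), so a region with the
# hypothetical ID "rs1234" and three bins yields "rs1234_bin1", "rs1234_bin2", "rs1234_bin3".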


#' Convert read counts across read groups to relative fractions from a \emph{SNPhood} object.
#'
#' \code{convertToAllelicFractions} converts read counts across read groups to their relative fractions among all
#' read groups (all read counts will be between 0 and 1, with a value of 1 for a particular read group indicating that all reads
#' at this particular position originate from that read group).
#' Affected slots are \code{readCountsUnbinned} and \code{readCountsBinned}.
#' It is recommended to save the resulting \code{\link{SNPhood}} object with a new name because it is not possible to go back from
#' fractions to read counts at a later point.
#'
#' @template SNPhood
#' @param roundDigits Numeric(1). Default 2. Number of digits after the decimal place when converting read counts to fractions
#' @param setNaNToZero Logical(1). Default FALSE. Should NaN (not a number) be converted to 0? NaN may result from individual regions or
#' bins with no reads across all read groups due to a division by zero.
#' @template verbose_TRUE
#' @return an object of class \code{\link{SNPhood}} with read counts across read groups (both for the slots readCountsUnbinned and
#' readCountsBinned) replaced by their respective relative fractions. Otherwise identical to the input \code{\link{SNPhood}} object.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' SNPhood_allelicFractions.o = convertToAllelicFractions(SNPhood.o)
#'
#' # Convert all NaN to 0 for subsequent analyses
#' SNPhood_allelicFractions.o = convertToAllelicFractions(SNPhood.o, setNaNToZero = TRUE)
#'
#' @seealso \code{\link{deleteReadGroups}}
#' @export
#' @import checkmate
convertToAllelicFractions  <-  function(SNPhood.o,
                                        roundDigits = 2,
                                        setNaNToZero = FALSE,
                                        verbose = TRUE) {

    # Check types and validity of arguments
    assertFlag(verbose)
    .checkObjectValidity(SNPhood.o, verbose = verbose)
    disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
    SNPhood.o@internal$disableObjectIntegrityChecking = TRUE

    if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
        stop(.getErrorForOnlyPrepareSamplesCorrelation())
    }

    assertInt(roundDigits, lower = 0)
    assertFlag(setNaNToZero)



    if (SNPhood.o@internal$isAllelicRatio) {
        warning("Allelic ratios have already been computed. Object will remain unchanged.")
        return(SNPhood.o)
    }

    if (SNPhood.o@internal$countType  ==  "enrichment") {
        warning("Cannot calculate allelic fractions for enrichments.")
        return(SNPhood.o)
    }


    if (nReadGroups(SNPhood.o)  ==  1) {
        warning("Only one read group has been found (", annotationReadGroups(SNPhood.o),"), cannot discard anything.")
        return(SNPhood.o)
    }


    nDatasetsCur = nDatasets(SNPhood.o)
    nReadGroupsCur = nReadGroups(SNPhood.o)


    for (i in seq_len(nDatasetsCur)) {

        # Calculate the sum of the number of reads per bin
        sumBins.m = 0
        sumRegions.vec = 0
        for (j in 1:nReadGroupsCur) {
            sumBins.m      = sumBins.m      + SNPhood.o@readCountsBinned  [[j]] [[i]]
            sumRegions.vec = sumRegions.vec + SNPhood.o@readCountsUnbinned[[j]] [[i]]
        }

        # Calculate the fraction
        for (j in 1:nReadGroupsCur) {
            SNPhood.o@readCountsUnbinned[[j]] [[i]] = round(SNPhood.o@readCountsUnbinned [[j]] [[i]] / sumRegions.vec, digits =  roundDigits)
            SNPhood.o@readCountsBinned  [[j]] [[i]] = round(SNPhood.o@readCountsBinned   [[j]] [[i]] / sumBins.m,      digits =  roundDigits)

            if (setNaNToZero) {
                SNPhood.o@readCountsUnbinned[[j]] [[i]][is.nan(SNPhood.o@readCountsUnbinned[[j]] [[i]])] = 0
                SNPhood.o@readCountsBinned  [[j]] [[i]][is.nan(SNPhood.o@readCountsBinned  [[j]] [[i]])] = 0
            }
        }

    }


    SNPhood.o@internal$isAllelicRatio = TRUE
    SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking

    SNPhood.o

}
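# Worked toy example (comment only, not executed) of the fraction computation above: for two
# read groups (names below are only examples) with counts 3 and 1 at a position, the fractions
# are 3/4 = 0.75 and 1/4 = 0.25. If all counts are 0, the division yields NaN, which is
# replaced by 0 only when setNaNToZero = TRUE.
#
#   counts = c(paternal = 3, maternal = 1)
#   round(counts / sum(counts), digits = 2)   # 0.75 0.25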


#' Associate genotypes with user regions from a \emph{SNPhood} object.
#'
#' The function \code{associateGenotypes} associates genotypes with SNP regions as defined in a \code{\linkS4class{SNPhood}} object. It is possible to assign
#' genotypes only for a subset of datasets as defined in a \code{\linkS4class{SNPhood}} object.
#' To avoid any ambiguities, a 1:1 mapping between genotypes and datasets must be given (see below).
#' @template SNPhood
#' @param genotypeMapping Data frame. A data frame that establishes the mapping between the datasets in the object and the corresponding genotype file and
#' sample name within that file; the columns \code{samples}, \code{genotypeFile}, and \code{sampleName} must be provided. See the examples and the vignette for a more detailed description of the supported file format.
#' @template verbose_TRUE
#' @return Object of class \code{\linkS4class{SNPhood}} with the genotype information added to the slot \code{annotation}, element genotype.
#' You may retrieve it via the accessor function \code{\link{annotation}}.
#' @export
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' fileGenotypes = list.files(pattern = "*genotypes*",system.file("extdata", package = "SNPhoodData"), full.names = TRUE)
#' mapping = data.frame(samples = annotationDatasets(SNPhood.o), genotypeFile = rep(fileGenotypes, 2), sampleName = c("NA10847", "NA12890"))
#' SNPhood.o = associateGenotypes(SNPhood.o, mapping)
#' @import checkmate
associateGenotypes <- function(SNPhood.o,
                               genotypeMapping,
                               verbose = TRUE) {

    assertFlag(verbose)
    .checkObjectValidity(SNPhood.o, verbose = verbose)
    disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
    SNPhood.o@internal$disableObjectIntegrityChecking = TRUE

    if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
        stop(.getErrorForOnlyPrepareSamplesCorrelation())
    }


    assertDataFrame(genotypeMapping, types = c("character", "factor"), any.missing = FALSE, min.rows = 1, min.cols = 3)
    #assertSubset(genotypeMapping$samples, annotationDatasets(SNPhood.o), empty.ok = FALSE)
    assertFlag(verbose)

    genotypeMapping[] <- lapply(genotypeMapping, as.character)

    assertSubset(c("samples", "genotypeFile", "sampleName"), colnames(genotypeMapping))
    assertCharacter(genotypeMapping$samples, unique = TRUE)

    # Init the data frame that stores the external genotypes
    res.df = data.frame(alleleRef = rep(NA, nRegions(SNPhood.o)),
                        alleleAlt = NA,
                        genotypeMismatch = FALSE)

    SNPhood.o@annotation$genotype$external = res.df

    # Delete NA in the list if there are any
    genotypeMapping = genotypeMapping[!is.na(genotypeMapping$samples),]

    # Check if the files all exist and are readable
    sapply(unique(genotypeMapping$genotypeFile), FUN = function(x) assertFileExists(x, access = "r"))


    # Parse VCF files
    for (file_vcf in unique(genotypeMapping$genotypeFile)) {

        if (verbose) message("\n Import genotypes from file ", file_vcf,". This may take a while...\n")
        # Get a list of all sample names that we have to extract from the file
        sampleNamesFile = genotypeMapping$sampleName[which(genotypeMapping$genotypeFile == file_vcf)]
        sampleNamesObject = genotypeMapping$samples [which(genotypeMapping$genotypeFile == file_vcf)]
        SNPhood.o = .extractGenotypesVCF(SNPhood.o, file_vcf, sampleNamesObject, sampleNamesFile, yieldSize = NULL, verbose = verbose)
    }

    if (verbose) message("The individual genotypes have been added to the annotation.")


    # Add genotype information to annotation
    for (individualCur in annotationDatasets(SNPhood.o)) {

        # If not defined, set it to NA
        index = which(genotypeMapping$samples == individualCur)
        if (length(index) == 0) {
            SNPhood.o@annotation$files[[individualCur]]$genotypeFile           = NA
            SNPhood.o@annotation$files[[individualCur]]$genotypeFileSampleName = NA
        } else {
            SNPhood.o@annotation$files[[individualCur]]$genotypeFile           = genotypeMapping$genotypeFile[index]
            SNPhood.o@annotation$files[[individualCur]]$genotypeFileSampleName = genotypeMapping$sampleName  [index]
        }
    }

    SNPhood.o = .identifyGenotypeIncompatibilities(SNPhood.o, verbose = verbose)

    SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking

    return(SNPhood.o)
}

#' Enable or disable object integrity checking for a \emph{SNPhood} object.
#'
#' The function \code{changeObjectIntegrityChecking} enables or disables object integrity checking for a \emph{SNPhood} object.
#' Disabling the checks might be desired for large objects when the integrity test takes too much time.
#' Note, however, that disabling these checks is not recommended.
#' @template SNPhood
#' @param disable Logical(1). Default FALSE. Disable the object integrity checking?
#' @template verbose_TRUE
#' @return Object of class \code{\linkS4class{SNPhood}} with object integrity checking enabled or disabled, as requested.
#' @export
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' SNPhood.o = changeObjectIntegrityChecking(SNPhood.o, disable = TRUE)
#' @import checkmate
changeObjectIntegrityChecking <- function(SNPhood.o,
                                          disable = FALSE,
                                          verbose = TRUE) {

    assertFlag(verbose)
    .checkObjectValidity(SNPhood.o, verbose = verbose)

    SNPhood.o@internal$disableObjectIntegrityChecking = disable

    if (disable) {
        message("Object integrity checking disabled for the specific object.")
    } else {
        message("Object integrity checking enabled for the specific object.")
    }

    SNPhood.o
}


#' @import checkmate
# TODO: Change object copying
.identifyGenotypeIncompatibilities <- function(SNPhood.o,
                                               verbose = TRUE) {


    nMismatchesTotal = 0
    mismatches.l = list()

    # Do some quality checking and set all genotypes to NA if the information from the VCF file and the read-derived files differ
    for (fileCur in annotationDatasets(SNPhood.o)) {

        if (is.na(annotation(SNPhood.o)$files[[fileCur]]$genotypeFile) | is.na(annotation(SNPhood.o)$files[[fileCur]]$genotypeFileSampleName)) {
            next
        }

        if (verbose) message(" Check for genotype mismatches for file ",fileCur)

        nMismatchesFile = 0
        mismatches.l[[fileCur]] = c()

        for (regionCur in 1:nRegions(SNPhood.o)) {

            # Sum up all genotypes across read groups
            for (alleleCur in 1:nReadGroups(SNPhood.o)) {

                dist = annotation(SNPhood.o)$genotype$readsDerived[[annotationReadGroups(SNPhood.o)[alleleCur]]][[fileCur]][, regionCur]
                if (alleleCur == 1) {
                    readDerived = dist
                } else {
                    readDerived = readDerived + dist
                }

            }

            readDerivedPresent = as.character(names(readDerived[readDerived > 0]))

            if (length(readDerivedPresent) == 0) {
                next
            }

            # Compare with information from VCF file
            readPossVCF = as.character(annotation(SNPhood.o)$genotype$external[regionCur, c("alleleRef","alleleAlt")])

            if (all(is.na(readPossVCF)) | all(is.na(readDerivedPresent))) {
                next
            }

            if (length(intersect(readDerivedPresent, readPossVCF)) == 0) {
                nMismatchesFile  = nMismatchesFile  + 1
                nMismatchesTotal = nMismatchesTotal + 1
                mismatches.l[[fileCur]] = c(mismatches.l[[fileCur]], regionCur)

                # Flag region
                SNPhood.o@annotation$genotype$external$genotypeMismatch[regionCur] = TRUE
            }

        }

        if (verbose) message("Found ",nMismatchesFile, " mismatches between the genotypes from the BAM file reads and the VCF information for sample ",fileCur)

    }

    if (verbose) message("Found ",nMismatchesTotal, " mismatches between the genotypes from the BAM file reads and the VCF information across all files")


    SNPhood.o@annotation$genotype$mismatches = mismatches.l

    return(SNPhood.o)

}
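# Minimal sketch (comment only, not executed) of the mismatch rule applied above: the alleles
# observed in the reads (read-derived genotype counts > 0) are compared with the REF/ALT
# alleles from the VCF file, and a region is flagged if the two sets do not overlap.
#
#   readDerivedPresent = c("A", "G")          # alleles seen in the reads (toy values)
#   readPossVCF        = c("C", "T")          # REF and ALT from the VCF (toy values)
#   length(intersect(readDerivedPresent, readPossVCF)) == 0   # TRUE -> genotype mismatch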



#' @import checkmate
#' @importFrom GenomeInfoDb seqlevels seqlevels<-
#' @importFrom VariantAnnotation scanVcfHeader ScanVcfParam readVcf geno alt ref samples
#' @import Rsamtools
# @importFrom Rsamtools bgzip indexTabix TabixFile
#' @import GenomicRanges
#' @importFrom SummarizedExperiment rowRanges

# #' @importFrom Biostrings unlist
# TODO: remove the object as argument to prevent copying
.extractGenotypesVCF <- function(SNPhood.o,
                                 file_vcf,
                                 sampleNamesObject,
                                 sampleNamesFile,
                                 yieldSize = NULL,
                                 verbose = TRUE) {

    # Check types and validity of arguments

    assertFileExists(file_vcf, access = "r")
    assertCharacter(sampleNamesObject, any.missing = FALSE, min.len = 1, max.len = nDatasets(SNPhood.o))
    assertCharacter(sampleNamesFile  , any.missing = FALSE, min.len = 1, max.len = nDatasets(SNPhood.o))
    assert(checkNull(yieldSize),
           checkIntegerish(yieldSize, lower = 100, len = 1)
    )

    if (is.null(yieldSize)) {
        yieldSizePar = NA_integer_
    } else {
        yieldSizePar = yieldSize
    }

    genotypes.l = list()

    # Either an existing uncompressed or gz-compressed file
    compressVcf <- bgzip(file_vcf, tempfile())
    idx <- indexTabix(compressVcf, "vcf")
    tab <- TabixFile(compressVcf, index = idx, yieldSize = yieldSizePar)

    # Scan the header first and get a list of sample names, and only parse those that intersect with the names in the SNPhood object
    hdr = scanVcfHeader(file_vcf)

    samplesToExtract = intersect(samples(hdr), unique(sampleNamesFile))
    samplesMissing = setdiff(unique(sampleNamesFile), samples(hdr))

    if (length(samplesToExtract) == 0) {
        stop("No common sample names found between the provided sample names for the SNPhood samples (",unique(sampleNamesFile),") and the samples defined in the file ",
             file_vcf, "(", paste0(samples(hdr),collapse = ","),")")
    }

    if (length(samplesToExtract) < length(unique(sampleNamesFile))) {
        warning("Could not extract genotypes for all samples. The following samples were not found in the file ", file_vcf, ": ",paste0(samplesMissing, collapse = ","))
    }

    if (verbose) message(" Common samples found in file ", file_vcf,". Extract genotypes for samples ",paste0(samplesToExtract, collapse = ","),".")

    # Replace "chr" to make sure that the chromosome names are identical by parsing the first lines and check
    # TODO: make this more general and determine if necessary automatically

    SNPs.gr = .getSNPGRangesObj(annotation(SNPhood.o))
    seqlevels(SNPs.gr) <- sub("chr", "", seqlevels(SNPs.gr))


    # Make the parsing as fast as possible. Only read in the GT field (genotype), and skip all info fields
    param = ScanVcfParam(
        fixed = c("ALT"),
        info = NA,
        geno = "GT",
        samples = samplesToExtract,
        trimEmpty = TRUE,
        which = SNPs.gr
    )


    # Parse VCF file
    vcf <- readVcf(tab, genome = parameters(SNPhood.o)$assemblyVersion, param = param)

    if (length(rowRanges(vcf)) == 0) {
        warning("Could not find genotype information for any of the SNPs.")
        return(SNPhood.o)
    }

    if (length(geno(vcf)) == 0 | !"GT" %in% names(geno(vcf))) {
        stop("Could not find or extract genotype information from the file ",file_vcf,". Does the file comply to the VCF standard and contains the required header lines?")
    }

    if (any(!mcols(rowRanges(vcf))$paramRangeID %in% names(SNPs.gr))) {
        warning("Note: At least one SNP ID in the VCF file differs from the SNP ID in the SNPhood object despite identical positions.")
    }


    if (verbose) message(" Found and imported ",length(rowRanges(vcf))," SNPs with genotype information across all samples out of all ",nRegions(SNPhood.o), " SNPs as defined in the SNPhood object.")


    # How does it work with yieldSize and the loop, use it at all or not necessary?
    #     open(tab)
    #     while (nrow(vcf_yield <- readVcf(tab, genome=parameters(SNPhood.o)$assemblyVersion, param=param)))
    #         cat("vcf dim:", dim(vcf_yield), "\n")
    #     close(tab)


    assertInteger(length(rowRanges(vcf)), lower = 1, len = 1, any.missing = FALSE)

    # Get a mapping of which SNP the entries correspond to
    mapping = sapply(seq_len(length(rowRanges(vcf))), function(x) {which(start(rowRanges(vcf))[x] == start(SNPs.gr))})

    ref.vec = rep(NA, length(ref(vcf)))
    ref.vec[mapping] = as.character(ref(vcf))

    not.na.index = which(!is.na(ref.vec))

    # If the REF allele is NA in the object, it has not yet been assigned.
    indexInit = which(is.na(SNPhood.o@annotation$genotype$external$alleleRef[mapping]))
    SNPhood.o@annotation$genotype$external$alleleRef[mapping[indexInit]] = ref.vec[mapping][indexInit]

    indexToTest = intersect(which(!is.na(SNPhood.o@annotation$genotype$external$alleleRef)), not.na.index)
    if (length(indexToTest) > 0) {
        indexMismatch = which(ref.vec[indexToTest] != SNPhood.o@annotation$genotype$external$alleleRef[indexToTest])

        # Genotype mismatch found, set to NA and mark as mismatch
        if (length(indexMismatch) > 0) {
            SNPhood.o@annotation$genotype$external$genotypeMismatch[indexToTest[indexMismatch]] = TRUE
            SNPhood.o@annotation$genotype$external$alleleRef[indexToTest[indexMismatch]] = NA

            warning("For ", length(indexMismatch), " SNPs (",paste0(rownames(SNPhood.o@annotation$genotype$external)[indexMismatch],collapse = ","), "), a genotype mismatch for the reference allele was found as compared to previously parsed VCF files. The reference allele has been set to NA for these SNPs, and they have been marked in the column genotypeMismatch")
        }

    }

    alt.vec =  rep(NA, length(alt(vcf)))
    alt.vec [mapping] = as.character(unlist(sapply(seq_len(length(alt(vcf))), function(x) {paste0(as.character(unlist(alt(vcf)[x])), collapse = ",")})))
    alt.vec  = unlist(alt.vec)

    not.na.index = which(!is.na(alt.vec))

    # If the ALT allele is NA in the object, it has not yet been assigned.
    indexInit = which(is.na(SNPhood.o@annotation$genotype$external$alleleAlt[mapping]))
    SNPhood.o@annotation$genotype$external$alleleAlt[mapping[indexInit]] = alt.vec[mapping][indexInit]

    indexToTest = intersect(which(!is.na(SNPhood.o@annotation$genotype$external$alleleAlt)), not.na.index)
    if (length(indexToTest) > 0) {

        # Get a list of all ALT alleles per SNP
        alleles.alt.l = strsplit(SNPhood.o@annotation$genotype$external$alleleAlt[indexToTest],",")

        # Append the new ones and make them unique.
        for (i in seq_len(length(indexToTest))) {
            alleles.alt.l[[i]] = unique(c( alleles.alt.l[[i]], unlist(strsplit(alt.vec[indexToTest[i]],",")) ))
        }

        # Convert back
        SNPhood.o@annotation$genotype$external$alleleAlt[indexToTest] =  sapply(alleles.alt.l,paste0,collapse = ",")

    }



    assertSubset(SNPhood.o@annotation$genotype$external$alleleRef, c("A","C","G","T",NA))
    assertSubset(SNPhood.o@annotation$genotype$external$alleleAlt, c("A","C","G","T",NA))

    for (i in seq_len(length(samplesToExtract))) {
        colIndex = which(colnames(geno(vcf)$GT) == samplesToExtract[i])
        colname = paste0(file_vcf, ":", samplesToExtract[i])
        assertIntegerish(colIndex, lower = 1, upper = length(colnames(geno(vcf)$GT)), any.missing = FALSE)

        # Add a new column, initially all NA
        SNPhood.o@annotation$genotype$external[,colname] = NA

        # Only change for the SNPs we have genotypes for
        SNPhood.o@annotation$genotype$external[mapping, colname] =  geno(vcf)$GT[, colIndex]

        # Set to NA if "."
        SNPhood.o@annotation$genotype$external[which(SNPhood.o@annotation$genotype$external[colname] == "."), colname] = NA
    }


    # TODO: genotypes.l

    SNPhood.o
}


#' @import checkmate
.checkAndConvertRegionArgument <- function(SNPhood.o,
                                           regions,
                                           nullAllowed = TRUE,
                                           maxLength = NULL) {

    if (testNull(maxLength)) maxLength = nRegions(SNPhood.o)

    minLength = ifelse(nullAllowed, 0, 1)

    assert(checkNull(regions),
           checkCharacter(regions, any.missing = FALSE, min.chars = 1, min.len = minLength, max.len = maxLength, unique = TRUE),
           checkIntegerish(regions, lower = 1, upper = nRegions(SNPhood.o),
                           min.len = minLength, max.len = maxLength, unique = TRUE)
    )

    if (!nullAllowed) {
        assert(!testNull(regions))
    }

    # Check if names or indices are provided, and convert to row numbers
    if (testCharacter(regions)) {
        assertSubset(regions, annotationRegions(SNPhood.o))
        regions = which(annotationRegions(SNPhood.o) %in% regions)
    }

    regions

}
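# Minimal sketch (comment only, not executed) of the name-to-index conversion performed above:
# character region IDs are translated to row numbers via their position in annotationRegions(),
# so downstream code can always work with integer indices. The IDs below are only examples
# (taken from the deleteRegions documentation).
#
#   regionNames = c("rs2822405", "rs467140")
#   which(annotationRegions(SNPhood.o) %in% regionNames)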

#' @import checkmate
.checkAndConvertReadGroupArgument <- function(SNPhood.o,
                                              readGroup) {


    if (nReadGroups(SNPhood.o) == 1) {
        assert(checkNull(readGroup),
               checkChoice(readGroup, annotationReadGroups(SNPhood.o)))

        if (testNull(readGroup)) {
            readGroup = annotationReadGroups(SNPhood.o)
        }

    } else {
        assertChoice(readGroup, annotationReadGroups(SNPhood.o))

    }

    readGroup

}

#' @import checkmate
.checkAndConvertDatasetArgument <- function(SNPhood.o,
                                            datasets,
                                            nullAllowed = TRUE,
                                            maxLength = NULL,
                                            returnNames = TRUE) {

    if (testNull(maxLength)) maxLength = nDatasets(SNPhood.o)

    minLength = ifelse(nullAllowed, 0, 1)

    assert(checkNull(datasets),
           checkCharacter(datasets, any.missing = FALSE, min.chars = 1, min.len = minLength, max.len = maxLength, unique = TRUE),
           checkIntegerish(datasets, lower = 1, upper = nDatasets(SNPhood.o),
                           min.len = minLength, max.len = maxLength, unique = TRUE)
    )

    if (!nullAllowed) {
        assert(!testNull(datasets))
    }

    # Check if names or indices are provided, and convert to row numbers
    if (testCharacter(datasets)) {
        assertSubset(datasets, annotationDatasets(SNPhood.o))
        datasets = which(annotationDatasets(SNPhood.o) %in% datasets)
    }

    if (returnNames) {
        datasets = annotationDatasets(SNPhood.o)[datasets]
    }

    datasets

}

.getErrorForOnlyPrepareSamplesCorrelation <- function() {
    paste0('The parameter "onlyPrepareForDatasetCorrelation" in the main function "analyzeSNPhood" has been set to TRUE.',
           ' Cannot execute the function because of this. Run the function "analyzeSNPhood" again and set "onlyPrepareForDatasetCorrelation" to FALSE (the default).')
}


.getListOfSupportedParameters  <-  function() {

    list(
        readFlag_isPaired                    = "logical",
        readFlag_isProperPair                = "logical",
        readFlag_isUnmappedQuery             = "logical",
        readFlag_hasUnmappedMate             = "logical",
        readFlag_isMinusStrand               = "logical",
        readFlag_isMateMinusStrand           = "logical",
        readFlag_isFirstMateRead             = "logical",
        readFlag_isSecondMateRead            = "logical",
        readFlag_isNotPrimaryRead            = "logical",
        readFlag_isNotPassingQualityControls = "logical",
        readFlag_isDuplicate                 = "logical",
        readFlag_reverseComplement           = "logical",
        readFlag_simpleCigar                 = "logical",
        path_userRegions                     = "character",
        zeroBasedCoordinates                 = "logical",
        regionSize                           = "integer",
        binSize                              = "integer",
        readGroupSpecific                       = "logical",
        strand                               = "character",
        startOpen                            = "logical",
        endOpen                              = "logical",
        headerLine                           = "logical",
        linesToParse                         = "integer",
        lastBinTreatment                     = "character",
        assemblyVersion                      = "character",
        nCores                               = "integer",
        keepAllReadCounts                    = "logical",
        normByInput                          = "logical",
        normAmongEachOther                   = "logical",
        poolDatasets                         = "logical"
    )
}
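# Minimal sketch (comment only, not executed) of how this name -> type map can be used to
# validate a user-supplied parameter list: every supplied parameter must be known and its value
# must match the expected type. This only illustrates the intended use of the list and is not
# the actual validation code of the package.
#
#   supported.l = .getListOfSupportedParameters()
#   par.l = list(binSize = 50L, normByInput = FALSE)
#   stopifnot(all(names(par.l) %in% names(supported.l)))
#   stopifnot(all(mapply(function(value, type) is(value, type),
#                        par.l, supported.l[names(par.l)])))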


.initSNPhoodObject <- function(SNPhood.o,
                               files.df,
                               readGroups,
                               onlyPrepareForDatasetCorrelation) {

    par.l = parameters(SNPhood.o)
    nRegionsCur = nRegions(SNPhood.o)

    stopifnot(!SNPhood.o@internal$isInitialized)

    assertCharacter(readGroups, min.len = 1, min.chars = 1)

    SNPhood.o@annotation$readGroups = readGroups

    for (alleleCur in readGroups) {

        SNPhood.o@readCountsUnbinned[[alleleCur]] = list()
        SNPhood.o@readCountsBinned[[alleleCur]] = list()
        SNPhood.o@internal$sizeFactors[[alleleCur]] = list()
        SNPhood.o@enrichmentBinned[[alleleCur]] = list()
        SNPhood.o@annotation$genotype$readsDerived [[alleleCur]]  = list()
        SNPhood.o@internal$readStartPos[[alleleCur]] = list()
        SNPhood.o@internal$readWidth[[alleleCur]] = list()


        # Iterate over all unique individual input files and signal files
        if (!all(is.na(files.df$input))) {
            inputFiles.vec  = c(unique(files.df$input), unique(unlist(strsplit(files.df$input, ",", fixed = TRUE))))
        } else {
            inputFiles.vec = c()
        }

        signalFiles.vec = unique(c(files.df$signal, files.df$individual))

        # Init the matrices
        for (filesCur in c(inputFiles.vec, signalFiles.vec)) {

            if (is.na(filesCur)) {
                next
            }

            SNPhood.o@readCountsUnbinned[[alleleCur]] [[filesCur]] = rep(0, nRegionsCur)

            # Some slots are only needed for signal files and not input files
            if (filesCur %in% signalFiles.vec) {
                # Moved to .extractAndNormalize
                #SNPhood.o@annotation$genotype$readsDerived[[alleleCur]] [[filesCur]] = array(data = rep(0,4), dim = c(4, nRegionsCur), dimnames = list(c("A", "C", "G", "T")))

                if (nReadGroups(SNPhood.o) > 1) {
                    SNPhood.o@internal$readStartPos[[alleleCur]][[filesCur]] = vector("list", nRegionsCur)
                    SNPhood.o@internal$readWidth   [[alleleCur]][[filesCur]] = vector("list", nRegionsCur)
                } else {
                    SNPhood.o@internal$readStartPos[[alleleCur]][[filesCur]] = NULL
                    SNPhood.o@internal$readWidth   [[alleleCur]][[filesCur]] = NULL
                }
            }



            if (!onlyPrepareForDatasetCorrelation) {
                SNPhood.o@readCountsBinned[[alleleCur]] [[filesCur]] = matrix(0, nrow = nRegionsCur, ncol = nBins(SNPhood.o))
            }

        }


        # Enrichment matrices are only stored for the final individuals, not the individual files
        if (!onlyPrepareForDatasetCorrelation) {

            if (par.l$normByInput) {

                for (filesCur in unique(files.df$individual)) {
                    SNPhood.o@enrichmentBinned[[alleleCur]] [[filesCur]] = matrix(0, nrow = nRegionsCur, ncol = nBins(SNPhood.o))
                }
            }
        }




    } # end for (alleleCur in readGroups)

    SNPhood.o@internal$isInitialized = TRUE

    SNPhood.o
}
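# Note on the data layout set up above: read counts are stored as nested lists indexed first by
# read group and then by dataset, i.e. readCountsUnbinned[[readGroup]][[dataset]] is a vector
# with one entry per region and readCountsBinned[[readGroup]][[dataset]] is a regions x bins
# matrix. The same layout is used for the enrichment and the internal read-position slots.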

#' @importFrom methods new
.createSNPhoodObject <- function(par.l,
                                 onlyPrepareForDatasetCorrelation) {

    SNPhood.o  <-  new("SNPhood",
                       annotation        = list(regions = NULL,
                                                genotype = list(readsDerived = list()),
                                                files = list(),
                                                readGroups = NA
                       ),
                       config            = par.l,
                       enrichmentBinned   = list(),
                       readCountsUnbinned = list(),
                       readCountsBinned    = list(),
                       internal             = list(disableObjectIntegrityChecking = TRUE,
                                                   isAllelicRatio = FALSE ,
                                                   mergedReadGroups = FALSE,
                                                   addResultsElementsAdded = character(),
                                                   readStartPos = list(),
                                                   readWidth = list(),
                                                   sizeFactors = list(),
                                                   globalBackground = list(),
                                                   countType = NA,
                                                   plot_origBinSNPPosition = NA,
                                                   plot_labelBins = NA,
                                                   isInitialized = FALSE
                       ),
                       additionalResults   = list()
    )

    SNPhood.o
}


.BAMHeaderConsistencyChecks <- function(par.l,
                                        annotationRegions,
                                        signalFileCur,
                                        expectedReadGroups = NULL,
                                        verbose = TRUE) {

    assertCharacter(signalFileCur, any.missing = FALSE)
    assertFileExists(signalFileCur, access = "r")
    assertFlag(verbose)

    chrSizes.df = .getGenomeData(par.l$assemblyVersion, includeChrM = TRUE)


    # Check if the index file is present for all BAM files. Use of which in the ScanBamParam function requires that a BAM index file (<filename>.bai) exists.
    # Indexing provides two significant benefits. First, an index allows a BAM file to be efficiently accessed by range. A corollary is that providing a which argument to ScanBamParam requires an index. Second, coordinates for extracting information from a BAM file can be derived from the index, so a portion of a remote BAM file can be retrieved with local access only to the index.
    .checkAndCreateIndexFile(signalFileCur)

    if (verbose) message(" Parse BAM header...")

    # Examine the BAM file header to extract the sequence names and read groups
    res = scanBamHeader(signalFileCur)
    chrSizes = res[[signalFileCur]]$targets

    if (length(chrSizes)  ==  0) {
        stop("The chromosome sizes are not specified in the BAM file ", signalFileCur,". No automated verification of the correct genome assembly version can be done. Carefully check if the genome asselbly version has been specified correctly.", sep = "")
    }

    definedChr = unique(as.character(seqnames(annotationRegions)))
    unknownChr = which(!names(chrSizes) %in% definedChr)
    if (length(unknownChr) > 0) {
        chrSizes = chrSizes[-unknownChr]
    }

    # Consistency checks for all chromosomes from the user regions
    for (chrCur in definedChr) {

        # Check if defined in BAM header
        if (!chrCur %in% names(chrSizes)) {
            stop("Chromosome ", chrCur, "(from the user-defined regions) not defined in BAM file ", signalFileCur)
        }

        # Check if we have the chromosome length available in the chrSizes.df object
        if (!chrCur %in% rownames(chrSizes.df)) {
            stop("Could not find ", chrCur," in automatically retrieved list of chromosome lengths. Only chromosomes ", paste0(rownames(chrSizes.df), collapse = ","), " are available. Please remove all references to chromosome ", chrCur, " in the BAM file.", sep = "")
        }

        # Check if size is identical to what has been automatically retrieved
        if (chrSizes[which(names(chrSizes) == chrCur)] != chrSizes.df[chrCur,]$size) {
            stop("A mismatch was detected for the chromosome size of ", chrCur," (BAM file: ", chrSizes[which(names(chrSizes) == chrCur)] , ", specified genome assembly: ", chrSizes.df[chrCur,]$size,"). ", sep = "")
        }

    }



    # Read groups
    readGroupSpecificityTmp = FALSE
    readGroups = "allReadGroups"


    if (par.l$readGroupSpecific) readGroupSpecificityTmp = TRUE

    if (readGroupSpecificityTmp) {

        indexes = which(grepl("@RG", names(res[[signalFileCur]]$text)))

        readGroups = as.character(sapply(res[[signalFileCur]]$text[indexes], function(x) {gsub("ID:","",x[[1]])}))

        if (verbose & testNull(expectedReadGroups)) message("  Read group specific reporting has been requested. The following read groups have been identified in the BAM header: ", paste(readGroups, collapse = ", " ))

        # If we don't find at least two read groups there, set readGroupSpecificity to FALSE

        # Check if the same set of read groups is defined in the files
        if (!testNull(expectedReadGroups)) {

            if (!identical(sort(readGroups), sort(expectedReadGroups))) {
                stop("The read groups as defined in the file ", signalFileCur, " are different from the ones in previous files. Expected ", paste0(expectedReadGroups, collapse = ","),", found: ", paste0(readGroups, collapse = ","))
            }
        }



        if (length(unique(readGroups)) < 2) {
            readGroupSpecificityTmp = FALSE
            readGroups = "allReadGroups"
            warning("Although it was requested to derive read count statistics read group-specifically, the BAM header does not contain at least two different read group definitions. Consequently, read group-specific reporting has been disabled. ")
        }

    }

    list("readGroupSpecific" = readGroupSpecificityTmp, "readGroups" = readGroups)
}
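# Minimal sketch (comment only, not executed) of the read group extraction above: the @RG lines
# of the BAM header are located in the text section returned by Rsamtools::scanBamHeader(), and
# the ID tag is stripped to obtain the read group names. The file name below is hypothetical.
#
#   res = scanBamHeader("signal.bam")
#   rgLines = res[["signal.bam"]]$text[grepl("@RG", names(res[["signal.bam"]]$text))]
#   readGroups = as.character(sapply(rgLines, function(x) gsub("ID:", "", x[[1]])))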


#' @import checkmate
#' @importFrom Biostrings nucleotideFrequencyAt subseq
#' @import GenomicRanges
# @importFrom GenomicRanges strand start
.extractAndNormalize <- function(par.l,
                                 annotationRegions,
                                 readGroups,
                                 SNPPos.gr,
                                 files.vec,
                                 individualName,
                                 bins.l,
                                 type,
                                 verbose = TRUE) {

    # TODO: Potential optimization: Save results of individual files so they don't have to be parsed multiple times

    assertVector(files.vec, min.len = 1)
    assertChoice(type, c("signal", "input"))
    assertFlag(verbose)


    if (type == "input") {
        avgBackground.l         = list()
        avgBackgroundOld.l      = list()
    }

    nRegionsCur = length(annotationRegions)

    readCountsRegions.l     = list()
    readCountsBins.l        = list()

    # Init final result list
    results.l = list(globalBackground   = NULL,
                     genotypes          = NULL,
                     readStartPos       = NULL,
                     readWidth          = NULL,
                     readCountsBinned   = NULL,
                     readCountsUnbinned = NULL
    )

    # Can be ignored for input
    if (type == "signal")  {

        for (alleleCur in readGroups) {

            results.l$genotypes[[alleleCur]] = array(data = rep(0,4), dim = c(4, nRegionsCur), dimnames = list(c("A", "C", "G", "T")))

            if (length(readGroups) > 1) {
                results.l$readStartPos[[alleleCur]] = vector("list", nRegionsCur)
                results.l$readWidth   [[alleleCur]] = vector("list", nRegionsCur)
            } else {
                results.l$readStartPos[[alleleCur]] = NULL
                results.l$readWidth   [[alleleCur]] = NULL
            }
        }
    }


    nFilesProcessed = 0

    for (fileCur in files.vec) {

        assertCharacter(fileCur, any.missing = FALSE)
        assertFileExists(fileCur, access = "r")

        nFilesProcessed = nFilesProcessed + 1
        if (verbose) message("\nPROCESS FILE ", nFilesProcessed, " of ", length(files.vec), ":", fileCur)

        .checkAndCreateIndexFile(fileCur)

        ##################
        # Various checks #
        ##################

        expectedReadGroups = readGroups

        # Only check read group consistency among signal files
        if (type == "input") expectedReadGroups = NULL
        res.l = .BAMHeaderConsistencyChecks(par.l, annotationRegions, fileCur, expectedReadGroups, verbose = verbose)


        ################
        # Get genotype #
        ################
        if (type == "signal") {

            start.timeTemp  <-  Sys.time()
            if (!par.l$onlyPrepareForDatasetCorrelation) {

                if (verbose) message(" Extract data from BAM file ", fileCur," for the SNPs only (this may take a while)...")
                bamSignalSNP = .extractFromBAM(SNPPos.gr, fileCur, .getFieldsForBAMParsing("SNPs"), par.l,  verbose)


                .callCalculateOverlapsReads <- function(x, bamSignal_filteredCurSNP, SNPPos.gr) {
                    suppressWarnings(
                        nucleotideFrequencyAt(
                            subseq(bamSignal_filteredCurSNP[[x]]$seq, start(SNPPos.gr)[x] - bamSignal_filteredCurSNP[[x]]$pos + 1, width = 1), 1)
                    )
                }

                if (verbose) message(" Determine genotype distribution at original user positions for A, C, G, and T...")



                for (alleleCur in readGroups) {
                    if (verbose) message("  Read group ",alleleCur)


                    res.l = .filterReads(bamSignalSNP, as.character(strand(SNPPos.gr)), alleleCur, par.l$strand)
                    bamSignal_filteredCurSNP = res.l$bamObj

                    # Suppress warnings so that N bases are simply not counted but don't confuse the user

                    readDist.l  = .execInParallelGen(nCores = par.l$nCores,
                                                     returnAsList = TRUE,
                                                     listNames = NULL,
                                                     iteration = seq_len(length(bamSignal_filteredCurSNP)),
                                                     abortIfErrorParallel = TRUE,
                                                     verbose = FALSE,
                                                     functionName = .callCalculateOverlapsReads, bamSignal_filteredCurSNP, SNPPos.gr)

                    results.l$genotypes[[alleleCur]] = results.l$genotypes[[alleleCur]] + unlist(readDist.l, use.names = FALSE)

                }


            }
            .printExecutionTime(start.timeTemp, verbose)

        }

        #################
        # Extract reads #
        #################
        if (verbose) message(" Extract data from BAM file ", fileCur," for the full SNP regions (this may take a while)...")
        bamFile = .extractFromBAM(annotationRegions, fileCur, .getFieldsForBAMParsing("regions"), par.l,  verbose)

        # Determine the read numbers per region before filtering by strand
        nReads.vec = unlist(lapply(bamFile, function(x) length((x$pos))), use.names = FALSE)

        nReadsSumAll = sum(nReads.vec)

        if (nReadsSumAll  ==  0) warning("  No reads were found in file ", fileCur," that overlap the user regions. This may also happen if the values of the various readFlag_* parameters have been specified too strictly or incorrectly. For example, if you have single-end data, make sure you specified this when generating the default parameters list.")

        #############################################################################
        # Filter reads and evaluate read counts per bin specifically per read group #
        #############################################################################

        if (verbose &  par.l$readGroupSpecific & type == "signal") message(" Analyze read counts specifically for each read group (this may take a while)...")
        if (verbose & !par.l$readGroupSpecific & type == "signal") message(" Analyze read counts (this may take a while)...")

        for (alleleCur in readGroups) {

            if (testNull(readCountsRegions.l[[alleleCur]])) {
                readCountsRegions.l[[alleleCur]] = list()
                readCountsBins.l   [[alleleCur]] = list()
            }

            if (verbose & par.l$readGroupSpecific) message("  Read group ", alleleCur, "...")


            res.l = .filterReads(bamFile, as.character(strand(annotationRegions)), alleleCur, par.l$strand)
            nFilteredReadsSignal  = res.l$filteredReads
            bamFile_filteredCur = res.l$bamObj

            if (verbose & nFilteredReadsSignal > 0) message("   Filtered ", nFilteredReadsSignal," reads out of ", nReadsSumAll," in file ", fileCur," because of strand and read group")

            nReads.vec = unlist(lapply(bamFile_filteredCur, function(x) length((x$pos))), use.names = FALSE)
            nReadsSum = sum(nReads.vec)
            if (nReadsSum  ==  0) warning("   No reads were found in file ", fileCur," that overlap the user regions for the read group ", alleleCur,".")


            # Determine the read numbers per region
            readCountsRegions.l[[alleleCur]] [[fileCur]] = nReads.vec

            if (par.l$onlyPrepareForDatasetCorrelation) next

            if (type == "signal") {

                # Save the start positions and pool them if necessary
                # Only necessary if the number of read groups is at least 2
                if (length(readGroups) > 1) {

                    iterationBam = seq_len(length(bamFile_filteredCur))

                    results.l$readStartPos[[alleleCur]] =
                        lapply(iterationBam, function(x) {c(results.l$readStartPos[[alleleCur]][[x]], bamFile_filteredCur[[x]]$pos)})

                    results.l$readWidth[[alleleCur]] =
                        lapply(iterationBam, function(x) {c(results.l$readWidth[[alleleCur]][[x]], bamFile_filteredCur[[x]]$qwidth)})

                }
            }


            # Calculate read coverage per bin for the input and the current read group
            if (verbose) message("   Analyze read counts per bin (this may take a while)...")
            readCountsBins.l[[alleleCur]] [[fileCur]]  = .calculateOverlapsBin(bamFile_filteredCur, par.l, bins.l, readCountsRegions.l[[alleleCur]] [[fileCur]], verbose)


            ####################################
            # Calculate genome-wide background #
            ####################################
            if (type == "input") {
                avgBackground.l[[fileCur]] = .calculateGenomeWideBackground(fileCur, par.l, nReadsToCheck = 1000, verbose = TRUE)

                # Compute the average number of counts per bin across all input files that belong to the particular individual
                avgBackgroundOld.l[[fileCur]]   = mean(readCountsBins.l[[alleleCur]] [[fileCur]])
                if (verbose) message("Global average background across bins (old, just for comparison): ",  avgBackgroundOld.l[[fileCur]])
            }


            rm(bamFile_filteredCur)
        }

        # Delete large object and explicitly call the garbage collector because bamSignal may be a large object
        rm(bamFile)


    } #end for all files



    #########################################
    # Normalize libraries and add up counts #
    #########################################

    if (nFilesProcessed > 1) {

        if (verbose) message(" Normalize library sizes...")

        # Determine the normalization factors independent of the read groups. For this, sum up the counts for all read groups
        counts.m = matrix(0, ncol = nFilesProcessed, nrow = nRegionsCur)

        for (alleleCur in readGroups) {

            for (i in seq_len(length(files.vec))) {
                counts.m[, i] = counts.m[, i] + readCountsRegions.l[[alleleCur]] [[files.vec[i]]]
            }
        }

        stopifnot(all(counts.m >= 0))

        colnames(counts.m) = files.vec
        res.l = .scaleLibraries(counts.m)
        sizeFactors.vec         = res.l$sizeFactors


        for (fileCur in files.vec) {

            stopifnot(!testNull(sizeFactors.vec[fileCur]))

            for (alleleCur in readGroups) {

                readCountsRegions.l[[alleleCur]] [[fileCur]] = readCountsRegions.l[[alleleCur]] [[fileCur]] / sizeFactors.vec[fileCur]

                if (par.l$onlyPrepareForDatasetCorrelation) next

                readCountsBins.l   [[alleleCur]] [[fileCur]] = readCountsBins.l   [[alleleCur]] [[fileCur]] / sizeFactors.vec[fileCur]
            }

            # The background estimates are per-file quantities, so scale them only once per file
            # rather than once per read group
            if (type == "input" && !par.l$onlyPrepareForDatasetCorrelation) {
                avgBackground.l[[fileCur]]     = avgBackground.l[[fileCur]]     / sizeFactors.vec[fileCur]
                avgBackgroundOld.l[[fileCur]]  = avgBackgroundOld.l[[fileCur]]  / sizeFactors.vec[fileCur]
            }


        }

    } # end if (nFilesProcessed > 1)

    #####################
    # Adjust background #
    #####################
    if (type == "input") {
        # Compute the mean of the per-file average backgrounds
        avgBackgroundAll = mean(unlist(avgBackground.l, use.names = FALSE))
        #avgBackgroundOldAll = mean(unlist(avgBackgroundOld.l))

        results.l$globalBackground = avgBackgroundAll
    }
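    # Example (illustrative numbers only): with two input files whose per-file average backgrounds are
    # 0.8 and 1.2 reads per bin, the pooled value above is globalBackground = mean(c(0.8, 1.2)) = 1.0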


    ################################
    # Sum up the normalized counts #
    ################################
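    # Reduce("+", ...) below adds the per-dataset count vectors of a read group element-wise across all
    # files, e.g. Reduce("+", list(c(1, 2), c(3, 4), c(5, 6))) yields c(9, 12): one pooled vector per read group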

    for (alleleCur in readGroups) {

        # Round to zero decimal places (i.e., integer counts) to allow potential subsequent normalization
        results.l$readCountsUnbinned[[alleleCur]] = round(Reduce("+", readCountsRegions.l[[alleleCur]]), 0)

        if (par.l$onlyPrepareForDatasetCorrelation) next

        results.l$readCountsBinned[[alleleCur]]   = round(Reduce("+", readCountsBins.l   [[alleleCur]]), 0)

    }


    results.l

}
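
# A minimal, hypothetical sketch (not part of SNPhood) of the library-size normalization applied above.
# It assumes that .scaleLibraries computes DESeq-style median-of-ratios size factors; the actual
# implementation may differ. .exampleScaleLibraries is an illustrative name only.
.exampleScaleLibraries <- function(counts.m) {

    # Per-region geometric means across all datasets (the pseudo-reference library)
    logGeoMeans = rowMeans(log(counts.m))

    # Size factor of each dataset: median ratio of its counts to the pseudo-reference,
    # computed over regions with non-zero counts and a finite geometric mean
    sizeFactors.vec = apply(counts.m, 2, function(cnts) {
        exp(median((log(cnts) - logGeoMeans)[is.finite(logGeoMeans) & cnts > 0]))
    })

    # Dividing each library by its size factor makes the counts comparable across datasets,
    # which is how sizeFactors.vec is used in the function above
    list(sizeFactors = sizeFactors.vec, normalizedCounts = sweep(counts.m, 2, sizeFactors.vec, "/"))
}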


# Important: Append the size factors because both signal and input files may be used multiple times, in
# combination with other signal and input files respectively, and each combination yields a different size factor
# Change the names so that each size factor can be traced back to the corresponding signal data file

# Swap the entries so that the mapping always pairs a signal dataset with its corresponding input data file

# Save the size factors. Reverse the names here so that the mapping is assembled correctly. This is correct and verified!
#names(sizeFactors.vec) = rev(names(sizeFactors.vec))
#SNPhood.o@internal$sizeFactors[[alleleCur]] [[individualCur]]   = c(SNPhood.o@internal$sizeFactors[[alleleCur]] [[individualCur]]  , sizeFactors.vec[inputFileSetCur])
#SNPhood.o@internal$sizeFactors[[alleleCur]] [[inputFileSetCur]] = c(SNPhood.o@internal$sizeFactors[[alleleCur]] [[inputFileSetCur]], sizeFactors.vec[individualCur])
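
# A minimal, hypothetical illustration (not part of SNPhood) of the name reversal in the commented-out
# code above: for a two-element named vector, rev() on the names swaps which dataset name addresses which
# size factor, so indexing by the input name then retrieves the factor paired with the signal dataset and
# vice versa. .exampleReverseSizeFactorNames and the file names are illustrative only.
.exampleReverseSizeFactorNames <- function() {
    sizeFactors.vec = c(signalFile = 0.8, inputFile = 1.25)   # illustrative values only
    names(sizeFactors.vec) = rev(names(sizeFactors.vec))
    sizeFactors.vec   # now c(inputFile = 0.8, signalFile = 1.25)
}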


.determineHeterozygosity <- function (SNPhood.o,
                                      readGroupsToTest =  annotationReadGroups(SNPhood.o),
                                      takeOnlyMostAbundantOnePerReadGroup = TRUE,
                                      verbose = TRUE) {

    assertFlag(verbose)
    assertSubset(readGroupsToTest, annotationReadGroups(SNPhood.o))
    assertFlag(takeOnlyMostAbundantOnePerReadGroup)

    heterozygous.m = matrix(NA, ncol = nDatasets(SNPhood.o) + 1, nrow = nRegions(SNPhood.o))
    dimnames(heterozygous.m) = list(annotationRegions(SNPhood.o), c(annotationDatasets(SNPhood.o),"shared"))

    for (i in seq_len(nDatasets(SNPhood.o))) {

        datasetCur = annotationDatasets(SNPhood.o)[i]

        for (j in seq_len(nRegions(SNPhood.o))) {

            genotypeDist.vec = c()
            isNA = FALSE

            for (readGroupCur in readGroupsToTest) {

                distCur = SNPhood.o@annotation$genotype$readsDerived[[readGroupCur]][[datasetCur]][,j]

                indexGenotypesPresent = which(distCur > 0)

                if (length(indexGenotypesPresent) > 0) {

                    # Sort the genotype counts so that the most abundant genotype comes first
                    genotypesPresent = sort(distCur[indexGenotypesPresent], decreasing = TRUE)

                    if (takeOnlyMostAbundantOnePerReadGroup) {
                        genotypeDist.vec = c(genotypeDist.vec, names(genotypesPresent)[1])
                    } else {
                        genotypeDist.vec = c(genotypeDist.vec, names(genotypesPresent))
                    }
                } else {
                    heterozygous.m[j, i] = NA
                    isNA = TRUE
                    break
                }

            }

            # Compare the genotypes collected across read groups: call the position heterozygous
            # only if more than one distinct base is observed
            basesObserved = unique(genotypeDist.vec[genotypeDist.vec %in% c("A", "C", "G", "T")])
            if (!isNA) {
                if (length(basesObserved) > 1) {
                    heterozygous.m[j, i] = TRUE
                } else {
                    heterozygous.m[j, i] = FALSE
                }
            }


        }

    }

    heterozygous.m[,ncol(heterozygous.m)] = rowSums(heterozygous.m, na.rm = TRUE) == (ncol(heterozygous.m) - 1)

    SNPhood.o@annotation$genotype$heterozygosity = heterozygous.m


    SNPhood.o

}
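
# A minimal, hypothetical illustration (not part of SNPhood) of how the "shared" column above is derived:
# a region counts as shared only if it is heterozygous (TRUE) in every dataset column. NA entries are
# dropped by rowSums, so a region with a missing call in any dataset cannot reach the required count.
# .exampleSharedHeterozygosity and the dataset names are illustrative only.
.exampleSharedHeterozygosity <- function() {
    het.m = cbind(dataset1 = c(TRUE, TRUE, NA), dataset2 = c(TRUE, FALSE, TRUE), shared = NA)
    het.m[, "shared"] = rowSums(het.m, na.rm = TRUE) == (ncol(het.m) - 1)
    het.m   # "shared" is TRUE only for the first region
}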
