Nothing
#' Helper function to generate a data frame that can be used as input for the function \emph{analyzeSNPhood}
#'
#' \code{collectFiles} creates a data frame that can be used as input for the function \code{\link{analyzeSNPhood}}.
#' The resulting data frame contains information about files that will be processed (column signal) and, optionally,
#' corresponding input files for normalization (column input) and labels to combine datasets to meta-datasets (column individual).
#'
#' Note that if you already have an object of class \code{\linkS4class{BamFile}} or \code{\linkS4class{BamFileList}}, this can easily be
#' integrated into the \code{SNPhood} framework by using the \code{\link[Rsamtools]{path}} function to specify the
#' value of the parameter \code{patternFiles}, see the examples below.
#'
#' @param patternFiles Character. If vector of length 1, absolute path to one or multiple BAM files that should be processed. Wildcards ("*") are allowed
#' (examples are *CTCF* or *.bam, see also examples). If vector of length > 1, each element must specify the absolute path to a BAM file,
#' with no wildcards being allowed. See also the note above concerning the integration of
#' \code{\linkS4class{BamFile}} or \code{\linkS4class{BamFileList}} objects.
#' For more details, see the examples and the vignette.
#' @param recursive Logical(1). Default FALSE. Should the search for BAM files within the directory be performed recursively?
#' If set to TRUE, all files matching the pattern within the specified directory and all of its subdirectories will be added.
#' If set to FALSE, only files within the specified directory but not any subdirectories will be used.
#' @param ignoreCase Logical(1). Default TRUE. Should the specified pattern be case sensitive?
#' @param inputFiles Character. Default NULL. Input files that should be used as a control for normalization.
#' Supported values are NA (no input normalization), a single character specifying one or multiple input files (comma-separated, see examples)
#' that should be used for all processed files, or a character vector of the same length as the number of files that will be processed.
#' Set to NULL if you want to add the files later manually in the data frame (see vignette).
#' @param individualID Character. Default NULL. Name of the individual IDs. Only relevant if datasets should be pooled.
#' @param genotypeMapping Character. Default NULL. Path to the corresponding genotype file in VCF format, followed by a colon and the name of the column in the VCF file.
#' Genotypes can also be integrated later using the function \code{\link{associateGenotypes}}
#' @template verbose_TRUE
#'
#' @return a data frame with the three columns \code{signal}, \code{input} and \code{individual} that can be used as input for the function \code{\link{analyzeSNPhood}}.
#' @examples
#' ## For brevity, only exemplary filenames are given in the following.
#' ## Note that in reality, absolute paths should be provided.
#' ## First some examples using specific files rather than files that
#' ## match a pattern in a particular directory
#'
#' ## Load SNPhoodData library
#' library(SNPhoodData)
#' files.df = collectFiles(patternFiles = paste0(system.file("extdata", package = "SNPhoodData"),"/*.bam"))
#'
#' ## If you already have BAM files in objects of class \code{\linkS4class{BamFile}} or \code{\linkS4class{BamFileList}},
#' ## you may use the following code snippet:
#' files = list.files(pattern = "*.bam$",system.file("extdata", package = "SNPhoodData"),full.names = TRUE)
#' BamFile.o = BamFile(files[1])
#' BamFiles.o = BamFileList(files)
#' files.df = collectFiles(patternFiles = path(BamFile.o))
#' files.df = collectFiles(patternFiles = path(BamFiles.o))
#' @seealso \code{\link{analyzeSNPhood}}
#' @export
#' @import checkmate
collectFiles <- function(patternFiles,
recursive = FALSE,
ignoreCase = TRUE,
inputFiles = NA,
individualID = NA,
genotypeMapping = NA,
verbose = TRUE) {
# Check types and validity of arguments
assertCharacter(patternFiles, any.missing = FALSE, min.chars = 1, min.len = 1)
assertFlag(recursive)
assertFlag(ignoreCase)
assert(checkScalarNA(inputFiles),
checkCharacter(inputFiles, any.missing = TRUE, all.missing = TRUE, min.chars = 1, min.len = 1))
assert(checkScalarNA(individualID),
checkCharacter(individualID, any.missing = TRUE, all.missing = TRUE, min.chars = 1, min.len = 1))
assertFlag(verbose)
files.vec = .collectFilesGen(
patternFiles = patternFiles,
recursive = recursive,
ignoreCase = ignoreCase,
verbose = verbose
)
nFilesToProcess = length(files.vec)
if (nFilesToProcess == 0) {
return(data.frame(signal = character(),
input = character(),
individual = character(),
genotype = character(),
stringsAsFactors = FALSE))
}
if (length(inputFiles) != nFilesToProcess & length(inputFiles) != 1) {
stop("Length of inputFiles vector must be either 1 or identical to the length of the number of files to be processed. Set to NA?")
}
if (length(individualID) != nFilesToProcess & length(individualID) != 1) {
stop("Length of individualID vector must be either 1 or identical to the length of the number of files to be processed. Set to NA?")
}
if (length(genotypeMapping) != nFilesToProcess & length(genotypeMapping) != 1) {
stop("Length of genotype vector must be either 1 or identical to the length of the number of files to be processed. Set to NA?")
}
data.frame(signal = files.vec,
input = inputFiles,
individual = individualID,
genotype = genotypeMapping,
stringsAsFactors = FALSE)
}
#' Helper function to generate a default parameter list as input for the function \emph{analyzeSNPhood}
#'
#' \code{getDefaultParameterList} generates a default parameter list that can be used as input for the function \code{\link{analyzeSNPhood}}.
#' The path to the user regions file can optionally be provided as an argument to the function. See the examples for further details.
#' Before running the function \code{\link{analyzeSNPhood}}, carefully check that the default parameters are suitable for the analysis.
#'
#' @param path_userRegions Character(1). Specify the value of the parameter \code{path_userRegions}
#' (absolute path to the user regions file, see the Vignette for details).
#' @param isPairedEndData Logical(1). Default TRUE. Are the data paired-end (TRUE) or single-end (FALSE)?
#'
#' @return a named list with default values for the currently supported parameters that can be used as
#' input for the function \code{\link{analyzeSNPhood}}:
#' \itemize{
#' \item{readFlag_isPaired: Logical(1), TRUE for paired-end data, NA for single-end}
#' \item{readFlag_isProperPair: Logical(1), TRUE}
#' \item{readFlag_isUnmappedQuery: Logical(1), FALSE}
#' \item{readFlag_hasUnmappedMate: Logical(1), FALSE}
#' \item{readFlag_isMinusStrand: Logical(1), NA}
#' \item{readFlag_isMateMinusStrand: Logical(1), NA}
#' \item{readFlag_isFirstMateRead: Logical(1), NA}
#' \item{readFlag_isSecondMateRead: Logical(1), NA}
#' \item{readFlag_isNotPrimaryRead: Logical(1), FALSE}
#' \item{readFlag_isNotPassingQualityControls: Logical(1), FALSE}
#' \item{readFlag_isDuplicate: Logical(1), FALSE}
#' \item{readFlag_reverseComplement: Logical(1), FALSE}
#' \item{readFlag_simpleCigar: Logical(1), TRUE}
#' \item{path_userRegions: Character(1), as given by the function argument path_userRegions}
#' \item{zeroBasedCoordinates: Logical(1), FALSE}
#' \item{regionSize: Integer(1), 500}
#' \item{binSize: Integer(1), 50}
#' \item{readGroupSpecific: Logical(1), TRUE}
#' \item{strand: Character(1), "both"}
#' \item{startOpen: Logical(1), FALSE}
#' \item{endOpen: Logical(1), FALSE}
#' \item{headerLine: Logical(1), FALSE}
#' \item{linesToParse: Integer(1), -1}
#' \item{lastBinTreatment: Character(1), "delete"}
#' \item{assemblyVersion: Character(1), "hg19"}
#' \item{nCores: Integer(1), 1}
#' \item{keepAllReadCounts: Logical(1), FALSE}
#' \item{normByInput: Logical(1), FALSE}
#' \item{normAmongEachOther: Logical(1), TRUE}
#' \item{poolDatasets: Logical(1), FALSE}
#'
#' }
#' For reasons of reduced redundancy, a detailed description of the parameters can be found at the end of the
#' main vignette in \code{SNPhood} (\code{browseVignettes("SNPhood")}).
#' @examples
#' ## Only one parameter can, optionally, be specified when calling the function
#' par.l = getDefaultParameterList(path_userRegions = "path/to/regions", isPairedEndData = TRUE)
#' ## If the file is not specified, you need to change it
#' ## before you can execute the function \code{\link{analyzeSNPhood}}
#' par.l = getDefaultParameterList(isPairedEndData = TRUE)
#' par.l$path_userRegions = "path/to/regions"
#' @seealso \code{\link{analyzeSNPhood}}
#' @export
#' @import checkmate
getDefaultParameterList <- function(path_userRegions = NULL,
isPairedEndData = TRUE) {
assert(checkNull(path_userRegions),
checkCharacter(path_userRegions, any.missing = FALSE, min.chars = 1, len = 1)
)
assertFlag(isPairedEndData)
readFlags.l = .generateDefaultReadFlags(pairedEndReads = isPairedEndData)
others.l = list(
readFlag_reverseComplement = FALSE,
readFlag_simpleCigar = TRUE,
readFlag_minMapQ = 0,
path_userRegions = path_userRegions,
zeroBasedCoordinates = FALSE,
regionSize = 500,
binSize = 50,
readGroupSpecific = TRUE,
strand = "both",
startOpen = FALSE,
endOpen = FALSE,
headerLine = FALSE,
linesToParse = -1,
lastBinTreatment = "delete",
assemblyVersion = "hg19",
effectiveGenomeSizePercentage = -1,
nCores = 1,
keepAllReadCounts = FALSE,
normByInput = FALSE,
normAmongEachOther = TRUE,
poolDatasets = FALSE
)
c(readFlags.l, others.l)
}
.getUniqueMappabilityData <- function(genomeAssembly,
readLength,
verbose = TRUE) {
assertSubset(genomeAssembly, c("hg38", "hg19", "hg18", "mm10", "mm9", "dm3"))
assertInt(readLength, lower = 15)
assertFlag(verbose)
# Taken from Table in Koehler, Ryan, et al. "The uniqueome: a mappability resource for short-tag sequencing." Bioinformatics 27.2 (2011): 272-274.
# For different assemblies, values are assumed to be identical
readLength.vec = c(25, 30, 35, 50, 60, 75, 90)
uniqueMappability.df = data.frame( "25" = c(rep(0.66,3), rep(0.699,2), 0.675),
"30" = c(rep(0.709,3), rep(0.744,2), 0.684),
"35" = c(rep(0.741,3), rep(0.771,2), 0.69),
"50" = c(rep(0.769,3), rep(0.791,2), 0.692),
"60" = c(rep(0.775,3), rep(0.794,2), 0.692),
"75" = c(rep(0.793,3), rep(0.807,2), 0.695),
"90" = c(rep(0.808,3), rep(0.817,2), 0.698),
stringsAsFactors = FALSE)
rownames(uniqueMappability.df) = c("hg38", "hg19", "hg18", "mm10", "mm9", "dm3")
# Get the read length that is closest to the real one
bestApprox = readLength.vec[which(abs(readLength.vec - readLength) == min(abs(readLength.vec - readLength)))][1]
if (verbose) message(" Read length from pre-computed table that most closely resembles observed read length: ", bestApprox)
if (abs(bestApprox - readLength) > 20) {
warning("Difference between observed read length and best match in the pre-computed table is bigger than 20bp. You may want to set the value of the parameter effectiveGenomeSizePercentage manually for increased accuracy with respect to the global genome average that is used for input normalization.")
}
res = uniqueMappability.df[which(rownames(uniqueMappability.df) == genomeAssembly), which(colnames(uniqueMappability.df) == paste0("X", bestApprox))]
if (verbose) message(" Value for the effective genome size: ", res)
res
}
#' @import checkmate
.calcBinLabelsPlot <- function(nBins) {
binLabels = seq(-floor(nBins/2), floor(nBins/2), 1)
# Current design decision:
# Odd number of bins (e.g, 7)
# -3 -2 -1 0 1 2 3
# Even number of bins (e.g, 6)
# -3 -2 -1 1 2 3
if (nBins %% 2 == 0) { # Delete the 0
binLabels = binLabels[-which(binLabels == 0)]
}
names(binLabels) = 1:nBins
binLabels
}
###################################
###################################
# PARSING AND CHECKING FUNCTIONS #
###################################
###################################
#' @import checkmate
#' @importFrom BiocParallel multicoreWorkers
.checkConfigFile <- function(par.l,
verbose = TRUE) {
supportedPar.l = getDefaultParameterList(NULL)
# Check types and validity of arguments
assertList(par.l, min.len = 1, all.missing = FALSE)
# Check if all parameters are present
assertSubset(names(supportedPar.l), names(par.l))
# Check if there are any additional parameters defined that are NOT supported. If yes, abort to make sure the user didn't unintentionally make a mistake
if (!testSubset(names(par.l), names(supportedPar.l))) {
addPar = setdiff(names(par.l), names(supportedPar.l))
stop("The following parameters are defined in the parameter list but are not supported (set them to NULL to delete them): \n ", paste(addPar, collapse = ","))
}
pars.df = .getParameters(.getListOfSupportedParameters())
parsRequired.vec = pars.df$parName
parsRequiredTypes.vec = pars.df$parType
names(parsRequiredTypes.vec) = parsRequired.vec
# Check if any of the required parameters are missing or if parameters that are not supported are included
parsNotSupported = as.character(names(par.l)[which(!pars.df$parName %in% names(par.l))])
parsNotFound = as.character(pars.df$parName[which(!(pars.df$parName %in% names(par.l)))])
if (length(which(is.na(parsNotSupported))) > 0) {
parsNotSupported = parsNotSupported[-which(is.na(parsNotSupported))]
}
if (length(which(is.na(parsNotFound))) > 0) {
parsNotFound = parsNotFound[-which(is.na(parsNotFound))]
}
if (length(parsNotSupported) > 0) {
warning(paste("The following parameters are not supported and have been ignored: ", paste(parsNotSupported, collapse = ", "), sep = ""))
}
if (length(parsNotFound) > 0) {
stop(paste("The following mandatory parameters have not been found: ", paste( parsNotFound, collapse = ", "), sep = ""))
}
# Check that all parameters are valid
assert(checkInt(par.l$linesToParse, lower = -1, upper = -1),
checkInt(par.l$linesToParse, lower = 1))
assertIntegerish(par.l$binSize, lower = 1, any.missing = FALSE, len = 1)
assertIntegerish(par.l$regionSize, lower = 1, any.missing = FALSE, len = 1)
assertIntegerish(par.l$nCores, lower = 1, any.missing = FALSE, len = 1)
assertCharacter(par.l$path_userRegions, any.missing = FALSE, len = 1)
assertChoice(par.l$assemblyVersion, c("hg38", "hg19", "hg18", "mm10", "mm9", "dm3", "sacCer3", "sacCer2"))
assertNumber(par.l$effectiveGenomeSizePercentage, lower = -1, upper = 1)
assertFlag(par.l$keepAllReadCounts)
assertFlag(par.l$readFlag_isPaired, na.ok = TRUE)
assertFlag(par.l$readFlag_isProperPair, na.ok = TRUE)
assertFlag(par.l$readFlag_isUnmappedQuery, na.ok = TRUE)
assertFlag(par.l$readFlag_hasUnmappedMate, na.ok = TRUE)
assertFlag(par.l$readFlag_isMinusStrand, na.ok = TRUE)
assertFlag(par.l$readFlag_isMateMinusStrand, na.ok = TRUE)
assertFlag(par.l$readFlag_isFirstMateRead, na.ok = TRUE)
assertFlag(par.l$readFlag_isSecondMateRead, na.ok = TRUE)
assertFlag(par.l$readFlag_isNotPrimaryRead, na.ok = TRUE)
assertFlag(par.l$readFlag_isNotPassingQualityControls, na.ok = TRUE)
assertFlag(par.l$readFlag_isDuplicate, na.ok = TRUE)
assertFlag(par.l$readFlag_reverseComplement, na.ok = TRUE)
assertFlag(par.l$readFlag_simpleCigar, na.ok = TRUE)
assertInt(par.l$readFlag_minMapQ, lower = 0)
assertFlag(par.l$zeroBasedCoordinates)
assertFlag(par.l$readGroupSpecific)
assertFlag(par.l$startOpen)
assertFlag(par.l$endOpen)
assertFlag(par.l$headerLine)
assertFlag(par.l$normByInput)
assertFlag(par.l$normAmongEachOther)
assertFlag(par.l$poolDatasets)
assertChoice(par.l$strand, c("sense", "antisense", "both"))
assertChoice(par.l$lastBinTreatment, c("keepUnchanged", "delete", "extend"))
if (par.l$binSize > (2* par.l$regionSize + 1)) {
stop("Value of parameter binSize not valid: Must be in the interval [1, 2* regionSize + 1].")
}
if (!par.l$readFlag_simpleCigar) {
stop("The parameter \"readFlag_simpleCigar\" must currently be set to TRUE in order to retrieve genotype information from the reads.")
}
if (par.l$nCores > multicoreWorkers()) {
warning("Requested ", par.l$nCores, " CPUs, but only ", multicoreWorkers(), " are available and can be used.")
par.l$nCores = multicoreWorkers()
}
if (verbose) message("SUCCESSFULLY FINISHED PARSING AND CHECKING THE CONFIGURATION FILE. PARSED PARAMETERS:")
for (i in seq_len(length(par.l))) {
if (verbose) message(" Parameter \"", names(par.l)[i],"\": \"", par.l[[i]],"\"")
}
par.l
}
# Although BAM is zero-based internally, samtools and RSamtools use 1-based positions.0-based coordinate system:
# A coordinate system where the first base of a sequence is zero. In this coordinate system, a region is specified by a half-closed-half-open interval.
# For example, the region between the 3rd and the 7th bases inclusive is [2 ; 7). The BAM, BCFv2, BED, and PSL formats are using the 0-based
# coordinate system
#' @import Rsamtools
# @importFrom Rsamtools scanBamWhat
#' @import checkmate
.extractFromBAM <- function(ranges.gr,
file,
fieldsToRetrieve,
par.l,
verbose = TRUE) {
# Check types and validity of arguments
assertClass(ranges.gr, "GRanges")
assertFileExists(file, access = "r")
assertSubset(fieldsToRetrieve, scanBamWhat(), empty.ok = FALSE)
assertList(par.l, min.len = 1, all.missing = FALSE)
assertFlag(verbose)
.extractFromBAMGen(regions.gr = ranges.gr,
file = file,
fieldsToRetrieve = fieldsToRetrieve,
flags = .constructScanBamFlags(par.l),
readGroupSpecificity = par.l$readGroupSpecific,
simpleCigar = par.l$readFlag_simpleCigar,
reverseComplement = par.l$readFlag_reverseComplement,
minMapQ = par.l$readFlag_minMapQ,
verbose = verbose)
}
# @importFrom GenomicRanges start end strand seqnames
#' @import GenomicRanges
#' @import checkmate
.createBins <- function(userRegions.gr,
par.l,
applyLastBinStrategy = TRUE,
chrSizes.df,
verbose = TRUE) {
# Check types and validity of arguments
assertClass(userRegions.gr, "GRanges")
assertList(par.l, any.missing = FALSE, min.len = 1)
assertFlag(applyLastBinStrategy)
assertDataFrame(chrSizes.df, min.rows = 1, min.cols = 2, col.names = NULL)
assertFlag(verbose)
# Bin each region according to the binSize parameter. Start from the leftmost position and continue to the right
binSize = par.l$binSize
nRegionsCur = length(userRegions.gr)
# Obtain the number of reads for each bin
.createStartSeq <- function(x, userRegions.gr, binSize) {
seq.int(start(userRegions.gr)[x], end(userRegions.gr)[x], binSize)
}
.createEndSeq <- function(x, userRegions.gr, binSize) {
seq.int(start(userRegions.gr)[x], end(userRegions.gr)[x], binSize) + binSize - 1
}
# Check here: https://support.bioconductor.org/p/66050/
# Parallelize functions
binStarts = .execInParallelGen(nCores = par.l$nCores,
returnAsList = TRUE,
listNames = NULL,
iteration = seq_len(nRegionsCur),
abortIfErrorParallel = TRUE,
verbose = FALSE,
functionName = .createStartSeq, userRegions.gr, binSize)
binEnds = .execInParallelGen(nCores = par.l$nCores,
returnAsList = TRUE,
listNames = NULL,
iteration = seq_len(nRegionsCur),
abortIfErrorParallel = TRUE,
verbose = FALSE,
functionName = .createEndSeq, userRegions.gr, binSize)
nBinsPerRegion.vec = sapply(binEnds, length)
# Adjust binEnds: The last row may contain ends that are larger than the actual ends due to potentially shorter last bins
# Because we modify an existing object within a function, use the global < <- operator.
# However, DO NOT USE THE <- - in a parallel environment, is does not work! That's why we use a regular for loop here
for (i in seq_len(nRegionsCur)) {
index = nBinsPerRegion.vec[i]
binEnds[[i]][index] = min(binEnds[[i]][index], end(userRegions.gr)[i])
}
.funLength <- function(x, nBinsPerRegion.vec) {rep.int(x, nBinsPerRegion.vec[x])}
.funStrand <- function(x, userRegions.gr, nBinsPerRegion.vec) {rep(as.character(strand(userRegions.gr)[x]), nBinsPerRegion.vec[x])}
.funChr <- function(x, userRegions.gr, nBinsPerRegion.vec) {rep(as.character(seqnames(userRegions.gr)[x]), nBinsPerRegion.vec[x])}
bins.l = list()
bins.l$chr = .execInParallelGen(nCores = par.l$nCores,
returnAsList = FALSE,
listNames = NULL,
iteration = seq_len(nRegionsCur),
abortIfErrorParallel = TRUE,
verbose = FALSE,
functionName = .funChr, userRegions.gr, nBinsPerRegion.vec)
bins.l$start = unlist(binStarts, use.names = FALSE)
bins.l$end = unlist(binEnds, use.names = FALSE)
bins.l$strand = .execInParallelGen(nCores = par.l$nCores,
returnAsList = FALSE,
listNames = NULL,
iteration = seq_len(nRegionsCur),
abortIfErrorParallel = TRUE,
verbose = FALSE,
functionName = .funStrand, userRegions.gr, nBinsPerRegion.vec)
bins.l$origRegionIndex = .execInParallelGen(nCores = par.l$nCores,
returnAsList = FALSE,
listNames = NULL,
iteration = seq_len(nRegionsCur),
abortIfErrorParallel = TRUE,
verbose = FALSE,
functionName = .funLength, nBinsPerRegion.vec)
bins.l$length = bins.l$end - bins.l$start + 1
# If the strand information is discarded, change strand to *
if (par.l$strand == "both") {
bins.l$strand = rep("*", length(bins.l$strand))
} else if (par.l$strand == "antisense") {
bins.l$strand = gsub("+",".", bins.l$strand, fixed = TRUE)
bins.l$strand = gsub("-","+", bins.l$strand, fixed = TRUE)
bins.l$strand = gsub(".","-", bins.l$strand, fixed = TRUE)
}
if (applyLastBinStrategy) {
# The last bin may be shorter: Option to delete it, extend it beyond the requested boundaries or leave it as it is
if (par.l$lastBinTreatment == "keepUnchanged") {
if (verbose) message(" The last bin in each region will be left untouched.")
} else if (par.l$lastBinTreatment == "delete") { # original user regions remain unchanged, because the last bin is deleted and not considered although the reads mapping there were already retrieved
indexToDeleteSignal = which(bins.l$length < binSize )
if (length(indexToDeleteSignal) > 0) {
if (verbose) message(" The last bin in each region will be deleted as they are shorter than the other bins (",
paste0(unique(bins.l$length[indexToDeleteSignal]), collapse = ","), " bp as compared to ", binSize, " bp).")
bins.l$chr = bins.l$chr [-indexToDeleteSignal]
bins.l$start = bins.l$start [-indexToDeleteSignal]
bins.l$end = bins.l$end [-indexToDeleteSignal]
bins.l$strand = bins.l$strand [-indexToDeleteSignal]
bins.l$origRegionIndex = bins.l$origRegionIndex[-indexToDeleteSignal]
bins.l$length = bins.l$length [-indexToDeleteSignal]
#Each index that is deleted belongs to one region. Thus, the number of bins has to be adjusted in the nBinsPerRegion.vec vector
.funBinsPerRegion <- function(x, bins.l, nRegionsCur) {
length(which(bins.l$origRegionIndex == seq_len(nRegionsCur)[x]))
}
nBinsPerRegion.vec = .execInParallelGen(nCores = par.l$nCores,
returnAsList = FALSE,
listNames = NULL,
iteration = seq_len(nRegionsCur),
abortIfErrorParallel = TRUE,
verbose = FALSE,
functionName = .funBinsPerRegion, bins.l, nRegionsCur)
# Adjust the user regions end coordinates accordingly
nBins = unique(table(bins.l$origRegionIndex))
assertIntegerish(nBins, len = 1)
end(userRegions.gr) = start(userRegions.gr) + (nBins * par.l$binSize) - 1
}
} else if (par.l$lastBinTreatment == "extend") { # In this particular case, the user regions have to be adapted in order to also retrieve the reads from the extended region associated with the last bin
indexToExtend = which(bins.l$length < binSize )
if (length(indexToExtend ) > 0) {
if (verbose) message(" The last bin in each region will be extended outside of the specified region to the length of the other bins.")
bins.l$end[indexToExtend] = bins.l$end[indexToExtend] + (binSize - bins.l$length[indexToExtend])
bins.l$length[indexToExtend] = bins.l$end[indexToExtend] - bins.l$start[indexToExtend] + 1
# Check if chromosome boundaries may be crossed due to the extension
indexBoundary = which(bins.l$end > chrSizes.df[bins.l$chr,]$size)
if (length(indexBoundary) > 0) {
if (verbose) message(" WARNING: Due to the extension of the last bin, at least one bin crossed the chromosome end and had to be adjusted to the last position of the chromosome.")
chrSizes.df = .getGenomeData(par.l$assemblyVersion)
bins.l$end[indexBoundary] = chrSizes.df[bins.l$chr[indexBoundary],]$size
}
# Extend the user regions accordingly
end(userRegions.gr)[bins.l$origRegionIndex[indexToExtend]] = bins.l$end[indexToExtend]
}
}
} # end applyLastBinStrategy
list("bins" = bins.l, "userRegions" = userRegions.gr)
}
#' @import checkmate
#' @import GenomicRanges
# @importFrom GenomicRanges countOverlaps
#' @importFrom IRanges IRangesList IRanges
.calculateOverlapsBin <- function(bamObj,
par.l,
bins.l,
nOverlapsRegion.vec,
verbose) {
# Check types and validity of arguments
assertList(bamObj, any.missing = FALSE, min.len = 1)
assertList(par.l, min.len = 1, all.missing = FALSE)
assertList(bins.l, min.len = 4, all.missing = FALSE)
assertIntegerish(nOverlapsRegion.vec, lower = 0, any.missing = FALSE, min.len = 1)
nBinsPerRegion.vec = table(bins.l$origRegionIndex)
nColumnsMax = max(nBinsPerRegion.vec)
nRegionsCur = length(nOverlapsRegion.vec)
# Create a matrix. Rownames = IDs of regions. Colnames: Bins
results.m = matrix(0, nrow = nRegionsCur , ncol = nColumnsMax)
# If we have only one bin, save some computational time
if (nColumnsMax == 1) {
results.m[,1] = nOverlapsRegion.vec
return(results.m)
}
# Do all the hard work. Compare the bin from each region with the reads originating from that region
# Use apply to do it once per original region to reduce the necessary comparisons appropriately so the resulting list contains only what is needed
# There are two possible options: One with IRanges and one with GRanges. The former is faster because there is no sequence and strand information involved.
# The sequence names are identical among all ranges so this can be ignored here. The strand is covered elsewhere also.
# New version of Bioconductor makes this operation much faster: 3 to 10x. See https://stat.ethz.ch/pipermail/bioc-devel/2014-December/006749.html and check if a code change is needed
.calcOverlapsBins <- function(x, bins.l, bamObj) {
index = which(bins.l$origRegionIndex == x)
countOverlaps(IRangesList(start = bins.l$start[index], end = bins.l$end[index]),
IRangesList(IRanges(start = bamObj[[x]]$pos, width = bamObj[[x]]$qwidth)))
}
# Make faster and only calculate overlaps if the region count in nOverlapsRegion.vec is > 0
indexes.vec = setdiff(seq_len(nRegionsCur), which(nOverlapsRegion.vec == 0))
if (length(indexes.vec) > 0) {
# Compute in parallel if possible
results.l = .execInParallelGen(nCores = par.l$nCores,
returnAsList = TRUE,
listNames = NULL,
iteration = indexes.vec,
abortIfErrorParallel = TRUE,
verbose = FALSE,
functionName = .calcOverlapsBins, bins.l, bamObj)
# Slower but uses GRanges instead of IRanges
# results.l = sapply(seq_len(nRegionsCur), FUN = function(x) {index=which(bins.l$origRegionIndex == x);
# countOverlaps(GRanges(seqnames = bins.l$chr[index], ranges = IRanges(start=bins.l$start[index], end=bins.l$end[index]), strand= bins.l$strand[index]),
# GRanges(seqnames= bamSignal_cur[[x]]$rname, ranges = IRanges(start=bamSignal_cur[[x]]$pos, width=bamSignal_cur[[x]]$qwidth)))
# })
# nReadCounts.vec = unlist(results.l)
# Unlist, level 1
# Make sure the length of the individual elements is always one
length.vec = sapply(results.l, length)
assertSetEqual(length(unique(length.vec)),1)
nReadCounts.vec = .execInParallelGen(nCores = par.l$nCores,
returnAsList = FALSE,
listNames = NULL,
iteration = results.l,
abortIfErrorParallel = TRUE,
verbose = FALSE,
functionName = match.fun("[["), 1)
# Save the read counts into the matrix
# A parallelized version caused a bug before, so let's switch back to the original version. Since nRegionsCur is not so large, this does not cost much execution time
for (x in indexes.vec) {
# Make sure that the startIndex is set correctly and that no gaps form when some rows are skipped
indexVec = which(indexes.vec == x)
startIndex = sum(nBinsPerRegion.vec[seq_len(indexVec - 1)]) + 1
results.m[x,1:nBinsPerRegion.vec[x]] = nReadCounts.vec[startIndex:(startIndex + nBinsPerRegion.vec[indexVec] - 1)]
}
} else {# No overlaps at all
# Nothing to do here, maybe except a warning
}
results.m
}
#' @importFrom IRanges as.matrix IRanges
#' @import checkmate
#' @import GenomicRanges
#' @importFrom S4Vectors as.table
.calculateOverlapsReads <- function(startPos.l,
width.l,
par.l,
bins.l,
nOverlapsRegion.vec,
verbose) {
# Check types and validity of arguments
assertList(startPos.l, any.missing = FALSE, len = length(nOverlapsRegion.vec))
assertList(width.l, any.missing = FALSE, len = length(nOverlapsRegion.vec))
assertList(par.l, min.len = 1, all.missing = FALSE)
assertList(bins.l, min.len = 4, all.missing = FALSE)
assertIntegerish(nOverlapsRegion.vec, lower = 0, any.missing = FALSE, min.len = 1)
nBinsPerRegion.vec = table(bins.l$origRegionIndex)
nColumnsMax = max(nBinsPerRegion.vec)
nRegionsCur = length(nOverlapsRegion.vec)
# Create a matrix. Rownames = IDs of regions. Colnames: Bins
results.l = list(start = vector( "list", nRegionsCur),
end = vector( "list", nRegionsCur))
# If we have only one bin, save some computational time
if (nColumnsMax == 1) {
# All reads overlap with only bin 1, start and end
results.l = lapply(1:nRegionsCur,
function(x) list(start = rep(1, nOverlapsRegion.vec[x]), end = rep(1, nOverlapsRegion.vec[x])))
# Transform to a slightly different nested format and return
return(list(start = lapply(results.l, "[[", "start"), end = lapply(results.l, "[[", "end")))
}
# Do all the hard work. See .calculateOverlapsBins, just switched around
.calcOverlapsReads <- function(x, bins.l, startPos.l, width.l) {
index = which(bins.l$origRegionIndex == x)
res2 = findOverlaps(IRanges(start = startPos.l[[x]], width = width.l[[x]]),
IRanges(start = bins.l$start[index], end = bins.l$end[index]),
type = "any")
hitsList.m = as.matrix(res2)
# Grep the start bins from the subjectHits column
start = as.numeric(hitsList.m[which(!duplicated(hitsList.m[,1])),2])
# Now use a trick to extract the end column: Use as.table to convert to a table that lists for each query how often it overlapped.
# Because overlaps in a particular user region must be, by definition, adjacent, the end equals the start + length -1
end = as.numeric(start + (as.numeric(as.table(res2)) - 1))
stopifnot(length(start) == length(end))
# Old: Too memory consuming because a lot of regions have a low number of reads
#RleList(list(start=start, end=end), compress = TRUE)
list(start = start, end = end)
}
# Make faster and only calculate overlaps if the region count in nOverlapsRegion.vec is > 0
indexes.vec = setdiff(seq_len(nRegionsCur), which(nOverlapsRegion.vec == 0))
if (length(indexes.vec) > 0) {
# Compute in parallel if possible
resultsTmp.l = .execInParallelGen(nCores = par.l$nCores,
returnAsList = TRUE,
listNames = NULL,
iteration = indexes.vec,
abortIfErrorParallel = TRUE,
verbose = FALSE,
functionName = .calcOverlapsReads, bins.l, startPos.l, width.l)
# Get back the correct order because some 0 regions may have been skipped
for (i in seq_len(length(resultsTmp.l))) {
results.l[["start"]][[indexes.vec[i]]] = resultsTmp.l[[i]]$start
results.l[["end"]] [[indexes.vec[i]]] = resultsTmp.l[[i]]$end
}
rm(resultsTmp.l)
} else {# No overlaps at all
stop("No overlaps found. Abort.")
}
results.l
}
#' @import checkmate
.calcRandomBackgroundDistr <- function(nRegions,
nBins,
readOverlaps.l) {
assertInt(nRegions, lower = 1)
assertInt(nBins, lower = 1)
assertList(readOverlaps.l)
res.l = list()
for (alleleCur in names(readOverlaps.l)) {
res.l[[alleleCur]] = matrix(0, nrow = nRegions, ncol = nBins)
}
# Loop through both alleles and all regions, and randomly reassign each read to one of the two alleles
# If the number of bins is 1, the result of this function will always be the same because alleles are chosen 50%50 and
# there are no other overlapping bins so no variation is possible
stopifnot(nRegions == length(readOverlaps.l[[1]][["start"]]))
for (alleleCur in names(readOverlaps.l)) {
nReads = sapply(readOverlaps.l[[alleleCur]][["start"]], length)
# Assign each read to read group 1 initially
rnd.l = sapply(1:nRegions, function(x) rep(1, nReads[x]))
for (regionCur in which(nReads > 0)) {
# Randomly assign each read to one of the two alleles, named 1 and 2 hereafter for simplicity. Make sure that the distribution is 50/50
rnd.l[[regionCur]][sample(1:nReads[regionCur], floor(nReads[regionCur] / 2), replace = FALSE)] = 2
rndAlleles = names(readOverlaps.l)[rnd.l[[regionCur]]]
binStarts = as.numeric(readOverlaps.l[[alleleCur]] [["start"]] [[regionCur]] )
binEnds = as.numeric(readOverlaps.l[[alleleCur]] [["end"]] [[regionCur]] )
# For each read, increase the randomly assigned allele assignment at the particular bins the read overlaps with
for (indexRead in seq_len(nReads[regionCur])) {
cols = binStarts[indexRead]:binEnds[indexRead]
indexRead = rndAlleles[indexRead]
res.l[[indexRead]] [regionCur, cols] = res.l[[indexRead]] [regionCur, cols] + 1
}
}
} # end for (alleleCur in names(readOverlaps.l))
res.l
}
#' @import checkmate
.constructScanBamFlags <- function(par.l) {
assertList(par.l, min.len = 1, all.missing = FALSE)
.constructScanBamFlagsGen(isPaired = par.l$readFlag_isPaired,
isProperPair = par.l$readFlag_isProperPair,
isUnmappedQuery = par.l$readFlag_isUnmappedQuery,
hasUnmappedMate = par.l$readFlag_hasUnmappedMate,
isMinusStrand = par.l$readFlag_isMinusStrand,
isMateMinusStrand = par.l$readFlag_isMateMinusStrand,
isFirstMateRead = par.l$readFlag_isFirstMateRead,
isSecondMateRead = par.l$readFlag_isSecondMateRead,
isSecondaryAlignment = par.l$readFlag_isNotPrimaryRead,
isNotPassingQualityControls = par.l$readFlag_isNotPassingQualityControls,
isDuplicate = par.l$readFlag_isDuplicate
)
}
#' Main function of \emph{SNPhood}
#'
#' \code{analyzeSNPhood} is the main function of the \code{SNPhood} package.
#' All results, parameters and metadata are stored in an object of class \code{\linkS4class{SNPhood}}.
#'
#' \strong{If you already have BAM files in objects of class \code{\linkS4class{BamFile}} or \code{\linkS4class{BamFileList}},
#' see the function \code{\link{collectFiles}} for how to seemlessly integrate them into the \code{SNPhood} framework.}
#'
#' In addition, see the vignettes for more details.
#'
#' @param par.l Named list. Named list with all required parameter names and their respective values, which
#' should be generated via the helper function \code{\link{getDefaultParameterList}}.
#' Note that all supported parameters must be defined in the list, as obtained by the function \code{\link{getDefaultParameterList}} .
#' See also ?getDefaultParameterList for details.
#' @param files.df Data frame with at least the column "signal" specifying the absolute paths to the BAM files that will be processed.
#' Optionally, further columns can be added.
#' Supported are "input", "individual" and "genotype". See the Vignette for further details.
#' The data frame can either be created manually or via the helper function \code{\link{collectFiles}}.
#' @param onlyPrepareForDatasetCorrelation Logical(1). Default FALSE. If set to TRUE, only steps necessary to analyze
#' the correlation among datasets with respect to their read counts are calculated, which is less thsan time-consuming than running the full pipeline.
#' This is a quality control step to identify outlier datasets
#' that show artefacts and that should therefore be removed from the analysis. If set to FALSE (the default), the full pipeline is
#' executed. In both cases, the function \code{\link{plotAndCalculateCorrelationDatasets}} can be executed afterwards.
#' @template verbose_TRUE
#'
#' @return Object of class \code{\linkS4class{SNPhood}}. See the class description (?"SNPhood-class", or click the link) for details.
#' @examples
#' ## For the following example, see also the workflow vignette!
#' library(SNPhoodData)
#' # get a list of files to process
#' dataDir = system.file("extdata", package = "SNPhoodData")
#' files.df = collectFiles(patternFiles = paste0(dataDir,"/*.bam"))
#' files.df$individual = c("GM10847", "GM10847", "GM12890", "GM12890")
#' fileUserRegions = list.files(pattern = "*.txt",dataDir, full.names = TRUE)
#' par.l = getDefaultParameterList(path_userRegions = fileUserRegions)
#' par.l$poolDatasets = TRUE
#' # Run the main function with the full pipeline
#' SNPhood.o = analyzeSNPhood (par.l, files.df)
#' @export
#' @import checkmate
#' @import Rsamtools
# @importFrom Rsamtools
analyzeSNPhood <- function(par.l,
files.df,
onlyPrepareForDatasetCorrelation = FALSE,
verbose = TRUE) {
# Check types and validity of arguments
if (!testList(par.l, all.missing = FALSE, len = length(getDefaultParameterList("/path/to/userRegions.example")))) {
stop("The provided argument par.l is not valid. It must be a named list of length ",
length(getDefaultParameterList("test")),
". The names of the list must be the names of the required parameters. See ?analyzeSNPhood, ?getDefaultParameterList and the vignettes for details.")
}
# memory.profile()
if (verbose) message("Total size of all objects: ", round(sum(memory.profile()) / (1024*1024),1), " Mb")
assertDataFrame(files.df, min.rows = 1, min.cols = 1)
assertFlag(onlyPrepareForDatasetCorrelation)
assertFlag(verbose)
start.time <- Sys.time()
if (verbose) message("\n\nSTART WITH AUTOMATED PIPELINE\n")
if (verbose) message("\nThe following arguments have been provided:")
##########################################################
# Check the configuration file and files to be processed #
##########################################################
par.l = .checkConfigFile(par.l, verbose)
# Make sure that the files data frame does not contain any factors
files.df[] <- lapply(files.df, as.character)
files.df$input[which(files.df$input == "")] = NA
# Remove spaces at the beginning and the end and after commas as well as newline characters
files.df$input = gsub("[\r\n]", "", files.df$input)
files.df$input = sub('^\\s+','' , files.df$input, perl = TRUE)
files.df$input = sub(',\\s+',',', files.df$input, perl = TRUE)
if (par.l$normByInput & par.l$readGroupSpecific) {
warning("Input normalization has been turned off because read-group specific reads have been requested. Currently, input normalization is not possible with read-group specific reads.")
files.df$input = NA
par.l$normByInput = FALSE
}
if (par.l$normByInput & par.l$effectiveGenomeSizePercentage == -1 & !par.l$assemblyVersion %in% c("hg38", "hg19", "hg18", "mm10", "mm9", "dm3")) {
stop("For the specified genome assembly, SNPhood does not have default values for the parameter effectiveGenomeSizePercentage. Please adjust the value manually. See the vignette or help pages for details.")
}
if (par.l$normByInput & any(is.na(files.df$input))) {
stop("Input normalization has been enabled, but at least one file has no corresponding input file as a control.")
}
if (!par.l$normByInput & any(!is.na(files.df$input))) {
stop("Input normalization has been disabled, but filenames for input files have been provided. Either enable input normalization or set input files to NA")
}
# Make sure the order of appearance does not play a role in files.df$input
if (!any(is.na(files.df$input))) {
input.split = strsplit(files.df$input, ",", fixed = TRUE)
input.split.length = sapply(input.split, length)
for (i in which(input.split.length > 1)) {
files.df$input[i] = paste(sort(input.split[[i]]), collapse = ",")
}
}
# Not possible to use the same signal files multiple times.
assertSetEqual(length(unique(files.df$signal)), length(files.df$signal))
if (par.l$poolDatasets) {
if (!"individual" %in% colnames(files.df)) stop("The parameter \"poolDatasets\" is set to TRUE but the column \"individual\" was not found in data frame files.df." )
# Check if all files belonging to the same individual have identical input files
for (individual in unique(files.df$individual)) {
if (length(unique(files.df[which(files.df$individual == individual),]$input)) > 1) {
stop("Different input files are assigned across all individuals with ID \"", individual,"\". Verify that the set of input files is identical among data sets with the same individual ID", sep = "")
}
}
} else {
files.df$individual = files.df$signal # Overwrite custom individual IDs to make sure we don't store redundant information (signal file and the corresponding ID) in the final object
}
nFilesToProcess = nrow(files.df)
nIndividualsToProcess = length(unique(files.df$individual))
# Make sure the normalization among the different data files is only performed when necessary und useful
if (par.l$normByInput | nFilesToProcess == 1 | par.l$readGroupSpecific) {
if (par.l$normAmongEachOther) warning("Forcing parameter normAmongEachOther to FALSE because either input normalization is turned on, only one files is going to be processed, or because allele-specific reads are requested")
par.l$normAmongEachOther = FALSE
}
###############################
# Check genotype integration #
###############################
integrateGenotype = TRUE
if (!"genotype" %in% colnames(files.df)) {
integrateGenotype = FALSE
files.df$genotype = NA
files.df$genotype = as.character(files.df$genotype)
} else {
if (all(is.na(files.df$genotype))) {
integrateGenotype = FALSE
} else {
index = which(!is.na(files.df$genotype))
res.split = strsplit(files.df$genotype[index], split = ":")
len = sapply(res.split,length)
assertInteger(unique(len), len = 1, lower = 2, upper = 2)
genotypeMapping.df = data.frame(samples = files.df$signal[index],
genotypeFile = sapply(res.split, "[[", 1),
sampleName = sapply(res.split, "[[", 2)
)
}
}
if (par.l$poolDatasets & integrateGenotype) {
stop("Cannot integrate genotypes automatically because of dataset pooling and potential mapping mixups. Please set the genotype column in the input data.frame to NA, and call the function again. To automatically integrate the genotypes, call the function associateGenotypes manually afterwards by prodiving a 1:1 mapping for individuals and genotypes.")
}
# Put into parameter list
par.l$input = files.df
par.l$onlyPrepareForDatasetCorrelation = onlyPrepareForDatasetCorrelation
#########################
# Create SNPhood object #
#########################
SNPhood.o = .createSNPhoodObject(par.l, onlyPrepareForDatasetCorrelation)
countType = "readCountsRaw"
if (par.l$normByInput) countType = "enrichment"
if (par.l$normAmongEachOther) countType = "readCountsNormalized"
SNPhood.o@internal$countType = countType
###############################
# Parse the user regions file #
###############################
chrSizes.df = .getGenomeData(par.l$assemblyVersion, includeChrM = TRUE)
userRegions.gr = .parseAndProcessUserRegions(par.l, chrSizes.df, verbose = verbose)
nRegionsCur = length(userRegions.gr)
SNPhood.o@annotation$regions = userRegions.gr
names(SNPhood.o@annotation$regions) = SNPhood.o@annotation$regions$annotation
SNPPos.gr = .getSNPGRangesObj(annotation(SNPhood.o))
###############################
# Split the regions into bins #
###############################
start.timeTemp <- Sys.time()
if (!onlyPrepareForDatasetCorrelation) {
# This has to be done before parsing the BAM file, as the user regions may change as a consequence of the lastBin parameter
if (verbose) message(" Split regions into bins")
res.l = .createBins(annotation(SNPhood.o)$regions, parameters(SNPhood.o), applyLastBinStrategy = TRUE, chrSizes.df)
bins.l = res.l$bins
SNPhood.o@annotation$regions = res.l$userRegions
nRegionsBins = length(bins.l$chr)
nBinsCur = max(table(bins.l$origRegionIndex))
if (verbose) message(" Split ", nRegionsCur," entries into ", nRegionsBins," bins")
SNPhood.o@annotation$bins = paste0("bin_", seq_len(nBinsCur))
# Store the bin number where the SNP is located
SNPhood.o@internal$plot_origBinSNPPosition = (nBinsCur + 1)/2
SNPhood.o@internal$plot_labelBins = .calcBinLabelsPlot(nBinsCur)
}
.printExecutionTime(start.timeTemp, verbose)
#.getMemoryProfile(SNPhood.o, verbose = verbose)
#######################################################
# Prepare the data structure if not initialzed before #
#######################################################
# Parse the first signal file and determine the number and names of the read groups
res.l = .BAMHeaderConsistencyChecks(par.l, SNPhood.o@annotation$regions, files.df$signal[1], NULL, verbose = TRUE)
SNPhood.o@config$readGroupSpecific = res.l$readGroupSpecific
SNPhood.o = .initSNPhoodObject(SNPhood.o, files.df, res.l$readGroups, onlyPrepareForDatasetCorrelation)
###############################################
# ITERATE OVER ALL UNIQUE SETS OF INPUT FILES #
###############################################
# Check if the set of input files has already been read in before
# Use an extra data structure to track which files or combinations of files have already been processed
processedFileSets.l = list()
inputFileSetCounter = 0
for (inputFileSetCur in unique(files.df$input)) {
start_inputSet = Sys.time()
inputFileSetCounter = inputFileSetCounter + 1
if (verbose) message("\nPROCESS INPUT FILE SET ", inputFileSetCur, " (", inputFileSetCounter, " of ", length(unique(files.df$input)), ")")
# Identify which rows have this set of input files
indexRowsInput.vec = which(files.df$input == inputFileSetCur)
if (is.na(inputFileSetCur)) {
indexRowsInput.vec = which(is.na(files.df$input))
}
if (par.l$normByInput) {
inputFileSetCur.vec = strsplit(inputFileSetCur, ",", fixed = TRUE)[[1]]
if (verbose) message(" Corresponding set of input files that will be used as a control: ", paste0(seq_len(length(inputFileSetCur.vec)), ":", inputFileSetCur.vec, collapse = ","))
#################################################################
# ITERATE OVER ALL UNIQUE INPUT FILES THAT ARE PART OF THIS SET #
#################################################################
if (testNull(processedFileSets.l[[inputFileSetCur]])) {
readGroups = annotationReadGroups(SNPhood.o)
results.l = .extractAndNormalize(par.l, SNPhood.o@annotation$regions, readGroups, SNPPos.gr, inputFileSetCur.vec, inputFileSetCur, bins.l, type = "input", verbose = TRUE)
SNPhood.o@internal$globalBackground[[inputFileSetCur]] = results.l$globalBackground
# Will only be one read group here, otherwise normalization cannot happen
for (alleleCur in readGroups) {
SNPhood.o@readCountsBinned [[alleleCur]] [[inputFileSetCur]] = results.l$readCountsBinned [[alleleCur]]
SNPhood.o@readCountsUnbinned [[alleleCur]] [[inputFileSetCur]] = results.l$readCountsUnbinned [[alleleCur]]
}
rm(results.l)
}
processedFileSets.l[[inputFileSetCur]] = TRUE
if (verbose) message("\nFINISHED PARSING INPUT FILES.\n")
if (verbose) message("\nPROCESS INDIVIDUALS WITH INPUT SET ", inputFileSetCur)
}
#################################################################################
# ITERATE OVER ALL UNIQUE SETS OF INDIVIDUALS WITH THE GIVEN SET OF INPUT FILES #
#################################################################################
individualsToProcess = as.character(unique(files.df$individual[indexRowsInput.vec]))
nIndividualsToProcess = length(individualsToProcess)
nIndProcessed = 0
for (individualCur in individualsToProcess) {
nIndProcessed = nIndProcessed + 1
# Identify all rows that match
indexRowsSignal.vec = which(files.df$individual == individualCur)
if (verbose) message("\nPROCESS INDIVIDUAL ", nIndProcessed," from ", nIndividualsToProcess,": ", individualCur)
start_individual = Sys.time()
##############################################################
# ITERATE OVER ALL SIGNAL FILES WITH THE GIVEN INDIVIDUAL ID #
##############################################################
signalFilesIndAll = as.character(unique(files.df$signal[indexRowsSignal.vec]))
SNPhood.o@annotation$files[[individualCur]] =
list(type = "signal", files = signalFilesIndAll, composite = TRUE, origName = individualCur, input = NA)
if (par.l$normByInput) {
# Add the information about corresponding input files
isComposite = ifelse(length(inputFileSetCur.vec) > 1, TRUE, FALSE)
SNPhood.o@annotation$files[[individualCur]]$input =
list(files = inputFileSetCur.vec, composite = isComposite, origName = inputFileSetCur)
}
####################################################################
# Parse all files that belong to the individual and extract counts #
####################################################################
readGroups = annotationReadGroups(SNPhood.o)
results.l = .extractAndNormalize(par.l, SNPhood.o@annotation$regions, readGroups, SNPPos.gr, signalFilesIndAll, individualCur, bins.l, type = "signal", verbose = TRUE)
SNPhood.o@internal$globalBackground[[individualCur]] = results.l$globalBackground
for (alleleCur in readGroups) {
SNPhood.o@internal$readStartPos[[alleleCur]] [[individualCur]] = results.l$readStartPos [[alleleCur]]
SNPhood.o@internal$readWidth [[alleleCur]] [[individualCur]] = results.l$readWidth [[alleleCur]]
SNPhood.o@readCountsBinned [[alleleCur]] [[individualCur]] = results.l$readCountsBinned [[alleleCur]]
SNPhood.o@readCountsUnbinned [[alleleCur]] [[individualCur]] = results.l$readCountsUnbinned [[alleleCur]]
SNPhood.o@annotation$genotype$readsDerived [[alleleCur]] [[individualCur]] = results.l$genotypes [[alleleCur]]
}
rm(results.l)
####################
# Normalize counts #
####################
if (par.l$normByInput) {
alleleCur = "allReadGroups"
if (verbose) message(" Normalize library sizes...")
counts.m = matrix(c(unlist(SNPhood.o@readCountsUnbinned[[alleleCur]] [[individualCur]] , use.names = FALSE),
unlist(SNPhood.o@readCountsUnbinned[[alleleCur]] [[inputFileSetCur]], use.names = FALSE)), byrow = FALSE, nrow = nRegionsCur)
stopifnot(all(counts.m >= 0))
colnames(counts.m) = c(individualCur, inputFileSetCur)
res.l = .scaleLibraries(counts.m)
# Update the library sizes and count matrices
sizeFactors.vec = res.l$sizeFactors
rm(res.l)
# Important: Append the size factors because both signal and input may be used multiple times with other signals and input files, respectively, each of which translates to a different size factor
# Change the name so that it can be traced back to the corresponding signal data file
# Switch so that the mapping is composed of a signal and the corresponding input datafile always
# Save the size factor. Reverse the names here to make the mapping properly. This is correct and verified!
# TODO: Remove, might be too complicated? Really worth saving in object?
names(sizeFactors.vec) = rev(names(sizeFactors.vec))
SNPhood.o@internal$sizeFactors[[alleleCur]] [[individualCur]] = c(SNPhood.o@internal$sizeFactors[[alleleCur]] [[individualCur]] , sizeFactors.vec[inputFileSetCur])
SNPhood.o@internal$sizeFactors[[alleleCur]] [[inputFileSetCur]] = c(SNPhood.o@internal$sizeFactors[[alleleCur]] [[inputFileSetCur]], sizeFactors.vec[individualCur])
###########################################
# Calculate enrichment ratio for each bin #
###########################################
# Divide the observed signal by the maximum of the global average per bin and the region mean.
# Thereby, make it less vulnerable to stochasticity as compared to calculating it by the observed counts in each bin
sizeFactorInput = SNPhood.o@internal$sizeFactors[[alleleCur]] [[inputFileSetCur]]
sizeFactorIndividual = SNPhood.o@internal$sizeFactors[[alleleCur]] [[individualCur]]
nReadsAvgBinGlobalInputNorm = SNPhood.o@internal$globalBackground[[inputFileSetCur]] / sizeFactorInput
nReadsAvgBinLocalInputNorm = rowMeans(SNPhood.o@readCountsBinned[[alleleCur]] [[inputFileSetCur]] / sizeFactorInput)
# The normalization factor is specific for each region but identical among bins of a particular region
# First produce a vector of length nRegions with the maximum of global and local bin
maxBinAvg.vec = pmax(nReadsAvgBinGlobalInputNorm, nReadsAvgBinLocalInputNorm)
# Now also incorporate the actual observed normalized read count values for each bin
maxAllInput.m = pmax(matrix(rep(maxBinAvg.vec, nBinsCur), ncol = nBinsCur),
(SNPhood.o@readCountsBinned[[alleleCur]] [[inputFileSetCur]] / sizeFactorInput)
)
SNPhood.o@enrichmentBinned[[alleleCur]] [[individualCur]] =
(SNPhood.o@readCountsBinned[[alleleCur]] [[individualCur]] / sizeFactorIndividual) / maxAllInput.m
rm(maxAllInput.m,maxBinAvg.vec)
} # end input size normalization
#.getMemoryProfile(SNPhood.o, verbose = verbose)
#############################################
# SAVE SOME MEMORY AND DELETE UNNEEDED DATA #
#############################################
if (!SNPhood.o@config$keepAllReadCounts) {
# Save memory and delete read count objects from the files before pooling
if (SNPhood.o@config$poolDatasets) {
# Delete files for which composite is not TRUE.
filesToDelete = SNPhood.o@annotation$files[[individualCur]]$files
if (par.l$normByInput) {
filesDelete = c(filesDelete, SNPhood.o@annotation$files[[individualCur]]$input$files)
}
for (fileCur in filesToDelete) {
SNPhood.o@annotation$files[[fileCur]] = NULL
for (readGroupCur in annotationReadGroups(SNPhood.o)) {
SNPhood.o@readCountsUnbinned [[readGroupCur]] [[fileCur]] = NULL
SNPhood.o@readCountsBinned [[readGroupCur]] [[fileCur]] = NULL
SNPhood.o@annotation$genotype$readsDerived[[readGroupCur]] [[fileCur]] = NULL
SNPhood.o@internal$readStartPos[[readGroupCur]] [[fileCur]] = NULL
SNPhood.o@internal$readWidth [[readGroupCur]] [[fileCur]] = NULL
}
}
}
# Reset slots readCountsBinned because it has been replaced by enrichment
for (readGroupCur in annotationReadGroups(SNPhood.o)) {
if (SNPhood.o@config$normByInput) {
SNPhood.o@readCountsBinned[[readGroupCur]][[individualCur]] = NULL
}
}
} # end if (!SNPhood.o@config$keepAllReadCounts)
# Optimize memory footprint and sort read start positions and width
if (nReadGroups(SNPhood.o) > 1) {
for (readGroupCur in annotationReadGroups(SNPhood.o)) {
nReads.vec = sapply(SNPhood.o@internal$readStartPos[[readGroupCur]][[individualCur]], length)
for (regionCur in which(nReads.vec > 0)) {
sortOrder = order(SNPhood.o@internal$readStartPos[[readGroupCur]][[individualCur]][[regionCur]])
SNPhood.o@internal$readStartPos[[readGroupCur]][[individualCur]][[regionCur]] = SNPhood.o@internal$readStartPos[[readGroupCur]][[individualCur]][[regionCur]][sortOrder]
SNPhood.o@internal$readWidth [[readGroupCur]][[individualCur]][[regionCur]] = SNPhood.o@internal$readWidth [[readGroupCur]][[individualCur]][[regionCur]][sortOrder]
# Can it be further compressed?
uniqueVal = unique(SNPhood.o@internal$readWidth[[readGroupCur]][[individualCur]][[regionCur]])
if (length(uniqueVal) == 1) {
SNPhood.o@internal$readWidth[[readGroupCur]][[individualCur]][[regionCur]] = uniqueVal
}
}
}
} # end if (nReadGroups(SNPhood.o) > 1)
#.getMemoryProfile(SNPhood.o, verbose = verbose)
if (verbose) message("\nFINISHED PROCESSING INDIVIDUAL\n")
.printExecutionTime(start_individual, verbose = verbose)
} # end for (individualCur in unique(files.df$individual[indexRowsInput.vec]))
if (verbose) message("\nFINISHED PROCESSING INPUT SET ", inputFileSetCur, ".\n")
.printExecutionTime(start_inputSet, verbose = verbose)
} # end for each unique set of input files
if (onlyPrepareForDatasetCorrelation) {
if (verbose) message("\n\nFINISHED SUCCESSFULLY WITH ALL INPUT FILES.\n")
.printExecutionTime(start.time, verbose = verbose)
warning("Note that you set the parameter \"onlyPrepareForDatasetCorrelation\" to TRUE. You will not be able to use any functionality except the sample correlation plots. For a full analysis, run the function again and set the parameter to FALSE.")
message("Warnings may have occurred, as indicated. Please check them carefully with unique(warnings()).")
return(SNPhood.o)
}
#######################################################################################
# Scale all libraries to a common size if no other normalization has been done before #
#######################################################################################
if (par.l$normAmongEachOther) {
if (verbose) message("Parameter \"normAmongEachOther\" has been set to TRUE. Normalize all files among each other.")
alleleCur = annotationReadGroups(SNPhood.o)
stopifnot(length(alleleCur) == 1)
# Construct a matrix with the region counts
counts.m = matrix(NA, ncol = nIndividualsToProcess , nrow = nRegionsCur)
uniqueInd = unique(files.df$individual)
for (i in seq_len(length(uniqueInd))) {
counts.m[, i] = unlist(SNPhood.o@readCountsUnbinned[[alleleCur]] [[uniqueInd[i]]], use.names = FALSE)
}
colnames(counts.m) = uniqueInd
# Obtain size factors
res.l = .scaleLibraries(counts.m)
# Divide each matrix by the size factors
for (i in seq_len(length(uniqueInd))) {
SNPhood.o@readCountsUnbinned[[alleleCur]] [[uniqueInd[i]]] = SNPhood.o@readCountsUnbinned[[alleleCur]] [[uniqueInd[i]]] / res.l$sizeFactors[i]
SNPhood.o@readCountsBinned [[alleleCur]] [[uniqueInd[i]]] = SNPhood.o@readCountsBinned [[alleleCur]] [[uniqueInd[i]]] / res.l$sizeFactors[i]
}
} # end if normalize among each other
##################################
# INTEGRATE GENOTYPE IF PROVIDED #
##################################
if (integrateGenotype) {
SNPhood.o = associateGenotypes(SNPhood.o, genotypeMapping.df, verbose)
}
############################
# DETERMINE HETEROZYGOSITY #
############################
SNPhood.o = .determineHeterozygosity(SNPhood.o, verbose = verbose)
# Set the flag that we now have an object that contains all data, validity checkIng is now active
SNPhood.o@internal$disableObjectIntegrityChecking = FALSE
.checkObjectValidity(SNPhood.o, verbose = verbose)
#.getMemoryProfile(SNPhood.o, verbose = verbose)
if (verbose) message("\n\nFINISHED SUCCESSFULLY WITH ALL INPUT FILES.\n")
.printExecutionTime(start.time, verbose = verbose)
message("Warnings may have occurred, as indicated. Please check them carefully with unique(warnings()).")
return(SNPhood.o)
}
#' @import GenomicRanges
# @importFrom GenomicRanges mcols start<- end<-
#' @import checkmate
.getSNPGRangesObj <- function(annotation) {
SNPPos.gr = annotation$regions
start(SNPPos.gr) = mcols(SNPPos.gr)$SNPPos
end(SNPPos.gr) = mcols(SNPPos.gr)$SNPPos
SNPPos.gr
}
#' @import checkmate
.getFieldsForBAMParsing <- function(type) {
assertChoice(type, c("regions","SNPs"))
fieldsToRetrieve = c("strand","pos","qwidth")
if (type == "SNPs") {
fieldsToRetrieve = c("strand","pos", "seq")
}
fieldsToRetrieve
}
##############################
##############################
# ANALYSIS AND VISUALIZATION #
##############################
##############################
#' Perform an allelic bias tests for each user region and bin.
#'
#' \code{testForAllelicBiases} performs tests for allelic biases for each binned user region using binomial tests.
#' For the parameter \code{readGroups}, the name of exactly two read groups must be provided for which allelic ratio tests should be performed.
#' See the Vignette for more details.
#' @template SNPhood
#' @template readGroups
#' @param confLevel Numeric(1). Default 0.95. The confidence level for estimating the confidence intervals. Must be between 0 and 1.
#' @param nullHypothesisFraction Numeric(1). Default 0.5. The expected probability under the null hypothesis of not having any bias. Must be between 0 and 1.
#' @param calcBackgroundDistr Logical(1). Default \code{TRUE}. Should the background distribution be calculated? Note that this can be usually very time-consuming.
#' @param nRepetitions Integer(1). Default 10. Number of repetitions for calculating the background distribution.
#' Only relevant if \code{calcBackgroundDistr} is set to \code{TRUE}
#' @param pValuesToTestBackground Numeric. Default c(0.0001, 0.0005, 0.001, 0.005, seq(0.01,1,0.01)).
#' Set of p-values for which corresponding FDR values will be computed
#' @template verbose_TRUE
#' @return Object of class \code{\linkS4class{SNPhood}} with all the data from the allelic bias test stored in the slot \code{additionalResults},
#' which can be easily retrieved via the accessor function \code{results}.
#' See the help pages of the result function (?results) or the vignette for details.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' ## Perform the test without calculating the background distribution
#' SNPhood.o = testForAllelicBiases (SNPhood.o, readGroups = c("paternal","maternal"))
#' str(results(SNPhood.o, type="allelicBias"), list.len = 8)
#' ## Check the parameters
#' results(SNPhood.o, type="allelicBias", elements = "parameters")
#' @export
#' @import checkmate
# TODO: Get rid of the object copying, but how?
testForAllelicBiases <- function(SNPhood.o,
readGroups,
confLevel = 0.95,
nullHypothesisFraction = 0.5,
calcBackgroundDistr = TRUE,
nRepetitions = 100,
pValuesToTestBackground = c(0.0001, 0.0005, 0.001, 0.005, seq(0.01,1,0.01)),
verbose = TRUE) {
start.time <- Sys.time()
# Check types and validity of arguments
assertFlag(verbose)
.checkObjectValidity(SNPhood.o, verbose = verbose)
disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
SNPhood.o@internal$disableObjectIntegrityChecking = TRUE
if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
stop(.getErrorForOnlyPrepareSamplesCorrelation())
}
assertSubset(readGroups, annotationReadGroups(SNPhood.o))
assertCharacter(readGroups, len = 2)
assertNumber(nullHypothesisFraction, lower = 0, upper = 1)
assertNumber(confLevel, lower = 0, upper = 1)
assertFlag(calcBackgroundDistr)
if (calcBackgroundDistr) {
assertInt(nRepetitions, lower = 1)
assertNumeric(pValuesToTestBackground, lower = 0, upper = 1, any.missing = FALSE)
}
# Only perform background distribution if the number of bins is at least 5.
# This is a temporary design decision and may be changed in the near future
if (calcBackgroundDistr & nBins(SNPhood.o) < 5) {
stop("The number of bins is smaller than 5. Currently, background normalization is only performed when the number of bins is at least 5. Please set calcBackgroundDistr to FALSE and run again. See the vignette for more help.")
}
# Get the matrices ready and perform a few tests
res.l = list()
res.l$pValue = list()
res.l$confIntervalMin = list()
res.l$confIntervalMax = list()
res.l$fractionEstimate = list()
res.l$background = list()
res.l$FDR_results = list()
# Perform binomial tests
nRegionsCur = nRegions(SNPhood.o)
nBinsCur = nBins(SNPhood.o)
# Bins only needed for the background calculation
if (calcBackgroundDistr) {
bins.l = .createBins(annotation(SNPhood.o)$regions, parameters(SNPhood.o), applyLastBinStrategy = TRUE, .getGenomeData(parameters(SNPhood.o)$assemblyVersion), verbose = verbose)$bins
}
for (individualCur in annotationDatasets(SNPhood.o)) {
if (verbose) message("Test allelic bias for dataset ",individualCur)
# Init the matrix
res.l$pValue[[individualCur]] = matrix(NA, nrow = nRegionsCur, ncol = nBinsCur)
res.l$confIntervalMin[[individualCur]] = matrix(NA, nrow = nRegionsCur, ncol = nBinsCur)
res.l$confIntervalMax[[individualCur]] = matrix(NA, nrow = nRegionsCur, ncol = nBinsCur)
res.l$confIntervalMin[[individualCur]] = matrix(NA, nrow = nRegionsCur, ncol = nBinsCur)
res.l$fractionEstimate[[individualCur]] = matrix(NA, nrow = nRegionsCur, ncol = nBinsCur)
# Do it for the correct alleles
indexAlleles = which(annotationReadGroups(SNPhood.o) %in% readGroups)
stopifnot(length(indexAlleles) == 2)
res.l$pValue[[individualCur]] =
matrix(.calcBinomTestVector(as.vector(SNPhood.o@readCountsBinned[[indexAlleles[1]]] [[individualCur]]),
as.vector(SNPhood.o@readCountsBinned[[indexAlleles[2]]] [[individualCur]]),
probSuccess = nullHypothesisFraction,
returnType = "p.value",
indexResult = 1,
conf.level = confLevel),
nrow = nRegionsCur, ncol = nBinsCur)
res.l$confIntervalMin[[individualCur]] =
matrix(.calcBinomTestVector(as.vector(SNPhood.o@readCountsBinned[[indexAlleles[1]]] [[individualCur]]),
as.vector(SNPhood.o@readCountsBinned[[indexAlleles[2]]] [[individualCur]]),
probSuccess = nullHypothesisFraction,
returnType = "conf.int",
indexResult = 1,
conf.level = confLevel),
nrow = nRegionsCur, ncol = nBinsCur)
res.l$confIntervalMax[[individualCur]] =
matrix(.calcBinomTestVector(as.vector(SNPhood.o@readCountsBinned[[indexAlleles[1]]] [[individualCur]]),
as.vector(SNPhood.o@readCountsBinned[[indexAlleles[2]]] [[individualCur]]),
probSuccess = nullHypothesisFraction,
returnType = "conf.int",
indexResult = 2,
conf.level = confLevel),
nrow = nRegionsCur, ncol = nBinsCur)
res.l$fractionEstimate[[individualCur]] =
matrix(.calcBinomTestVector(as.vector(SNPhood.o@readCountsBinned[[indexAlleles[1]]] [[individualCur]]),
as.vector(SNPhood.o@readCountsBinned[[indexAlleles[2]]] [[individualCur]]),
probSuccess = nullHypothesisFraction,
returnType = "estimate",
conf.level = confLevel),
nrow = nRegionsCur, ncol = nBinsCur)
if (calcBackgroundDistr) {
# First calculate the read overlaps. Parse the BAM files again for this. This is timely but only needed here
if (verbose) message(" Calculating read overlaps for the background distribution... This may take a while")
readOverlaps.l = list()
for (x in readGroups) {
if (verbose) message(" Read group ",x)
readOverlaps.l[[x]] = .calculateOverlapsReads(
SNPhood.o@internal$readStartPos[[x]] [[individualCur]],
SNPhood.o@internal$readWidth [[x]] [[individualCur]],
parameters(SNPhood.o),
bins.l,
SNPhood.o@readCountsUnbinned [[x]] [[individualCur]],
verbose)
}
pValuesMinReal = rep(NA, nRegionsCur)
# the - is important because there is no min.col so reverse the sign of the elements
pMinIndex = max.col(-res.l$pValue[[individualCur]])
pValuesMinReal = sapply(1:nRegionsCur, function(x) {res.l$pValue[[individualCur]][x, pMinIndex[x]]})
# Do the same for the shuffled results
res.l$background[[individualCur]] = list()
# Save the minimal p value per region and repetition here. Preallocate space for improved efficiency
pValuesMinSim = rep(NA, nRepetitions * nRegionsCur)
if (verbose)
message(" Calculate background distribution with ",nRepetitions," repetitions. This may take a while.")
# Calculate the background distribution in parallel. Get shuffled reads for both alleles
# Independent of x. Could also be removed from the function presumably
.callRandomBackgroundDistr <- function(x) {
.calcRandomBackgroundDistr(nRegionsCur, nBinsCur, readOverlaps.l)
}
res.shuffled.l = .execInParallelGen(nCores = parameters(SNPhood.o)$nCores,
returnAsList = TRUE,
listNames = NULL,
iteration = seq_len(nRepetitions),
abortIfErrorParallel = TRUE,
verbose = FALSE,
functionName = .callRandomBackgroundDistr)
for (repCur in 1:nRepetitions) {
# pValuesMinSim2[[repCur]] = data.frame(pValueShuffled = numeric(nRegionsCur))
res.l$background[[individualCur]][[repCur]] = matrix(
.calcBinomTestVector(
as.vector(res.shuffled.l[[repCur]][[1]]),
as.vector(res.shuffled.l[[repCur]][[2]]),
returnType = "p.value",
indexResult = 1,
conf.level = confLevel
),
nrow = nRegionsCur, ncol = nBinsCur
)
# Determine the smallest p values for each region and each repetition
# min.col does not exist, so we just invert the values and use max.col
pMinIndex = max.col(-res.l$background[[individualCur]][[repCur]])
startIndex = (repCur - 1) * nRegionsCur + 1
endIndex = startIndex + nRegionsCur - 1
pValuesMinSim[startIndex:endIndex] = sapply(1:nRegionsCur, function(x) {
res.l$background[[individualCur]][[repCur]][x, pMinIndex[x]]
})
}
resManual.df = data.frame(FDR = numeric(),
pValueThreshold = numeric(),
nReal = numeric(),
nSim = numeric())
for (pThresCur in pValuesToTestBackground) {
# Average number of significant results
nSim = length(which(pValuesMinSim <= pThresCur)) / nRepetitions
# Observed number of real significant results
nReal = length(which(pValuesMinReal <= pThresCur))
# FDR is calculated over all user regions
fdr = nSim / (nReal + nSim)
if ((nReal + nSim) == 0) {
fdr = 0
}
resManual.df = rbind(resManual.df, data.frame(pValueThreshold = pThresCur,
FDR = fdr,
nReal = nReal,
nSim = nSim))
}
res.l$FDR_results[[individualCur]] = resManual.df
} # end if (calcBackgroundDistr)
} # end for (i in annotationDatasets(SNPhood.o)) {
# Finish up
readGroupsDiscarded = annotationReadGroups(SNPhood.o)[which(!annotationReadGroups(SNPhood.o) %in% readGroups)]
res.l$parameters = list(readGroupsTested = readGroups,
readGroupsDiscarded = readGroupsDiscarded,
confLevel = confLevel,
calculateBackgroundDistribution = calcBackgroundDistr,
nRepetitionsBackground = nRepetitions,
pValuesToTestBackground = pValuesToTestBackground)
SNPhood.o@additionalResults$allelicBias = res.l
if (!"allelicBias" %in% SNPhood.o@internal$addResultsElementsAdded) {
SNPhood.o@internal$addResultsElementsAdded = c(SNPhood.o@internal$addResultsElementsAdded, "allelicBias")
}
if (verbose) message("\n\nFINISHED SUCCESSFULLY WITH ALLELIC BIAS TEST.\n")
.printExecutionTime(start.time, verbose = verbose)
SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking
SNPhood.o
}
#' Get the number of SNP regions for a \emph{SNPhood} object.
#'
#' \code{nRegions} is a helper function that returns the number of SNP regions that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @template verbose_FALSE
#' @return Integer. Number of SNP regions that are defined in the \code{\linkS4class{SNPhood}} object
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' nRegions(SNPhood.o)
#' @aliases regions
#' @rdname regions-methods
#' @export
nRegions <- function(SNPhood.o,
verbose = FALSE) {
assertFlag(verbose)
length(SNPhood.o@annotation$regions)
}
#' Get the number of bins for a \emph{SNPhood} object.
#'
#' Return the number of bins that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @template verbose_FALSE
#' @return Integer. Number of bins that are defined in the \code{\linkS4class{SNPhood}} object
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' nBins(SNPhood.o)
#' @export
#' @aliases bins
#' @rdname bins-methods
nBins <- function(SNPhood.o,
verbose = FALSE) {
assertFlag(verbose)
if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
return(0)
}
length(SNPhood.o@annotation$bins)
}
#' Get the number of datasets for a \emph{SNPhood} object.
#'
#' Return the number of datasets that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @template verbose_FALSE
#' @return Integer. Number of datasets that are defined in the \code{\linkS4class{SNPhood}} object
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' nDatasets(SNPhood.o)
#' @export
#' @aliases datasets
#' @rdname datasets-methods
nDatasets <- function(SNPhood.o,
verbose = FALSE) {
assertFlag(verbose)
length(SNPhood.o@annotation$files)
}
#' Get the number of read groups for a \emph{SNPhood} object.
#'
#' Return the number of read groups that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @template verbose_FALSE
#' @return Integer. Number of read groups that are defined in the \code{\linkS4class{SNPhood}} object
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' nReadGroups(SNPhood.o)
#' @export
#' @aliases readGroups
#' @rdname readGroups-methods
nReadGroups <- function(SNPhood.o,
verbose = FALSE) {
assertFlag(verbose)
length(SNPhood.o@annotation$readGroups)
}
#' Get the annotation of SNP regions for a \emph{SNPhood} object.
#'
#' Return the annotation of the SNP regions that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @param asGRangesObj Logical(1). Default FALSE. Should the full annotation be returned (as \code{GRanges} object) or only the annotation of the SNP regions (as character vector)?
#' @template verbose_FALSE
#' @return If \code{asGRangesObj} is set to \code{TRUE}, a \code{GRanges} object is returned. Otherwise, a character vector with the currently stored SNP annotation is returned.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' IDs.vec = annotationRegions(SNPhood.o, asGRangesObj = FALSE)
#' IDs.gr = annotationRegions(SNPhood.o, asGRangesObj = TRUE)
#' @export
#' @import checkmate
#' @import GenomicRanges
# @importFrom GenomicRanges mcols
annotationRegions <- function(SNPhood.o,
asGRangesObj = FALSE,
verbose = FALSE) {
assertFlag(verbose)
assertFlag(asGRangesObj)
if (!asGRangesObj) {
return(mcols(SNPhood.o@annotation$regions)$annotation)
} else {
return(SNPhood.o@annotation$regions)
}
}
#' Get the annotation(names) of the datasets in a \emph{SNPhood} object.
#'
#' Return the names of the datasets/individuals that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @template verbose_FALSE
#' @return Character vector. Names of the datasets/individuals that are defined in the \code{\linkS4class{SNPhood}} object.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' annotationDatasets(SNPhood.o)
#' @export
annotationDatasets <- function(SNPhood.o,
verbose = FALSE) {
assertFlag(verbose)
names(SNPhood.o@annotation$files)
}
#' Get the annotation(names) of the read groups in a \emph{SNPhood} object.
#'
#' Return the names of the read groups that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @template verbose_FALSE
#' @return Character vector. Names of the read groups that are defined in the \code{\linkS4class{SNPhood}} object.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' annotationReadGroups(SNPhood.o)
#' @export
annotationReadGroups <- function(SNPhood.o,
verbose = FALSE) {
assertFlag(verbose)
SNPhood.o@annotation$readGroups
}
#' Get the annotation(names) of the bins in a \emph{SNPhood} object.
#'
#' Return the names of the Bins that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @template verbose_FALSE
#' @return Character vector. Names of the bins that are defined in the \code{\linkS4class{SNPhood}} object.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' annotationReadGroups(SNPhood.o)
#' @export
annotationBins <- function(SNPhood.o,
verbose = FALSE) {
assertFlag(verbose)
SNPhood.o@annotation$bins
}
#' Merges the counts of all read groups for a \emph{SNPhood} object
#'
#' \code{mergeReadGroups} merges the counts of all read groups for a \code{\linkS4class{SNPhood}} object. This function can only be executed if more than one read group is defined in the object and
#' if read counts have not been converted into allelic fractions. Also carefully note the warning below.
#' @template SNPhood
#' @param summaryFunction Character(1). Default "sum". Either "sum" or "mean". How should the read counts from different read groups be summarized.
#' If set to "sum", all counts are summed up, which yields values that are identical as running the main analysis non-allele-specifically.
#' If set to "mean", the mean value across all read groups is calculated.
#' @template verbose_TRUE
#' @return A modified \code{\linkS4class{SNPhood}} object with only one read group "allReadGroups", with all occurences of
#' the original read groups replaced by "allReadGroups". For object consistency, as mentioned in the warning below,
#' some results from analyses depending on read groups are removed completely.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' nReadGroups(SNPhood.o)
#' SNPhood_merged.o = mergeReadGroups(SNPhood.o)
#' nReadGroups(SNPhood.o)
#' @section Warning:
#' \strong{Merging read groups is irreversible. This transformation cannot be undone.}
#' It might therefore be advisable to save the resulting object in a new variable as shown in the examples.
#'
#' \strong{Results from the allelic bias test and clustering results will also be removed to keep the object consistent.}
#' @export
#' @import checkmate
mergeReadGroups <- function(SNPhood.o,
summaryFunction = "sum",
verbose = TRUE) {
# TODO: merge only a particular set of read groups
# Check types and validity of arguments
assertFlag(verbose)
.checkObjectValidity(SNPhood.o, verbose = verbose)
disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
SNPhood.o@internal$disableObjectIntegrityChecking = TRUE
if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
stop(.getErrorForOnlyPrepareSamplesCorrelation())
}
assertChoice(summaryFunction, c("mean", "sum"), .var.name = summaryFunction)
if (SNPhood.o@internal$isAllelicRatio) {
warning("Cannot merge read groups because read counts have been converted to allelic fractions. Object will remain unchanged.")
return(SNPhood.o)
}
uniqueReadGroups = annotationReadGroups(SNPhood.o)
if (length(uniqueReadGroups) == 1) {
warning("Cannot summarize read groups because only one read group was found: ", uniqueReadGroups,". Object will remain unchanged.", sep = "")
return(SNPhood.o)
}
nColumns = nBins(SNPhood.o)
nRows = nRegions(SNPhood.o)
for (i in 1:nDatasets(SNPhood.o)) {
# Need to store multiple matrices in memory to properly summarize them later
matrices.l = list()
matrices2.l = list()
for (j in seq_len(length(uniqueReadGroups))) {
matrices.l[[j]] = SNPhood.o@readCountsBinned [[uniqueReadGroups[j]]] [[i]]
matrices2.l[[j]] = SNPhood.o@readCountsUnbinned[[uniqueReadGroups[j]]] [[i]]
}
# Make a 3D array from the list of matrices
arr <- array( unlist(matrices.l) , c(nRows, nColumns, length(matrices.l)) )
arr2 <- array( unlist(matrices2.l) , c(nRows, nColumns, length(matrices2.l)) )
# Now summarize
if (summaryFunction == "mean") {
SNPhood.o@readCountsBinned [["allReadGroups"]] [[i]] = rowMeans(arr, dims = 2 )
SNPhood.o@readCountsUnbinned[["allReadGroups"]] [[i]] = rowMeans(arr2, dims = 2 )
} else if (summaryFunction == "sum") {
SNPhood.o@readCountsBinned [["allReadGroups"]] [[i]] = Reduce('+', matrices.l)
SNPhood.o@readCountsUnbinned[["allReadGroups"]] [[i]] = Reduce('+', matrices2.l)
}
}
names(SNPhood.o@readCountsBinned [["allReadGroups"]]) = names(SNPhood.o@readCountsBinned[[1]])
names(SNPhood.o@readCountsUnbinned[["allReadGroups"]]) = names(SNPhood.o@readCountsUnbinned[[1]])
# Currently, the enrichment slot will be empty because enrichment is only possible if done non-allele-specifically
# Delete old read groups
for (readGroupCur in uniqueReadGroups) {
SNPhood.o@readCountsBinned[[readGroupCur]] = NULL
SNPhood.o@readCountsUnbinned[[readGroupCur]] = NULL
SNPhood.o@enrichmentBinned[[readGroupCur]] = NULL
SNPhood.o@internal$sizeFactors[[readGroupCur]] = NULL
# Delete clustering results also
if (!testNull(SNPhood.o@additionalResults$clustering$enrichmentBinned[[readGroupCur]])) {
SNPhood.o@additionalResults$clustering[[readGroupCur]] = NULL
SNPhood.o@additionalResults$clustering[[readGroupCur]] = NULL
}
}
# Reset, not meaningful to merge the old values anyway
SNPhood.o@enrichmentBinned[["allReadGroups"]] = list()
SNPhood.o@internal$sizeFactors[["allReadGroups"]] = list()
# Read start pos and width
SNPhood.o@internal$readStartPos[["allReadGroups"]] = list()
SNPhood.o@internal$readWidth [["allReadGroups"]] = list()
for (datasetCur in annotationDatasets(SNPhood.o)) {
SNPhood.o@internal$readStartPos[["allReadGroups"]][[datasetCur]] = vector("list", nRegions(SNPhood.o))
SNPhood.o@internal$readWidth [["allReadGroups"]][[datasetCur]] = vector("list", nRegions(SNPhood.o))
for (readGroupCur in uniqueReadGroups) {
for (regionCur in seq_len(nRegions(SNPhood.o))) {
SNPhood.o@internal$readStartPos[["allReadGroups"]][[datasetCur]][[regionCur]] =
c(SNPhood.o@internal$readStartPos[["allReadGroups"]][[datasetCur]][[regionCur]], SNPhood.o@internal$readStartPos[[readGroupCur]][[datasetCur]][[regionCur]])
SNPhood.o@internal$readWidth[["allReadGroups"]][[datasetCur]][[regionCur]] =
c(SNPhood.o@internal$readWidth[["allReadGroups"]][[datasetCur]][[regionCur]], SNPhood.o@internal$readWidth[[readGroupCur]][[datasetCur]][[regionCur]])
}
}
}
# Delete old references
for (j in uniqueReadGroups) {
SNPhood.o@internal$readStartPos[[j]] = NULL
SNPhood.o@internal$readWidth[[j]] = NULL
}
SNPhood.o@annotation$genotype$readsDerived[["allReadGroups"]] = list()
for (i in annotationDatasets(SNPhood.o)) {
isInitiated = FALSE
for (j in uniqueReadGroups) {
if (!isInitiated) { # Init
SNPhood.o@annotation$genotype$readsDerived[["allReadGroups"]][[i]] = SNPhood.o@annotation$genotype$readsDerived[[j]][[i]]
isInitiated = TRUE
} else {
SNPhood.o@annotation$genotype$readsDerived[["allReadGroups"]][[i]] = SNPhood.o@annotation$genotype$readsDerived[["allReadGroups"]][[i]] +
SNPhood.o@annotation$genotype$readsDerived[[j]][[i]]
}
}
}
#names(SNPhood.o@annotation$genotype$readsDerived[["allReadGroups"]])[i] = annotationDatasets(SNPhood.o)[i]
for (j in uniqueReadGroups) {
SNPhood.o@annotation$genotype$readsDerived[[j]] = NULL
}
# Remove the results from the allelicBias analysis
SNPhood.o@additionalResults$allelicBias = NULL
# Remove the clustering results completely if nothing is left
if (testNull(SNPhood.o@additionalResults$clustering$enrichmentBinned)) {
SNPhood.o@additionalResults$clustering = NULL
}
# Make object consistent
SNPhood.o@annotation$readGroups = "allReadGroups"
SNPhood.o@internal$mergedReadGroups = TRUE
SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking
SNPhood.o
}
#' Delete a particular set of read groups.
#'
#' \code{deleteReadGroups} deletes a particular set of read groups from a \emph{SNPhood} object.
#' Removal is irreversible. It is therefore recommended to save the resulting \code{\link{SNPhood}} object with a new name.
#'
#' @template SNPhood
#' @template readGroups
#' @template verbose_TRUE
#' @return an object of class \code{\link{SNPhood}} with read counts across read groups (both for the slots readCountsUnbinned and readCountsBinned) replaced by their respective relative fractions. Otherwise identical to the input \code{\link{SNPhood}} object.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' SNPhood_allelicFractions.o = deleteReadGroups(SNPhood.o, "ambiguous")
#' @seealso \code{\link{deleteDatasets}}, \code{\link{deleteRegions}}
#' @export
#' @import checkmate
deleteReadGroups <- function(SNPhood.o,
readGroups = NULL,
verbose = TRUE) {
# Check types and validity of arguments
assertFlag(verbose)
.checkObjectValidity(SNPhood.o, verbose = verbose)
disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
SNPhood.o@internal$disableObjectIntegrityChecking = TRUE
if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
stop(.getErrorForOnlyPrepareSamplesCorrelation())
}
assertSubset(readGroups, SNPhood.o@annotation$readGroups)
assertCharacter(readGroups, any.missing = FALSE, min.chars = 1, min.len = 0, max.len = nReadGroups(SNPhood.o) - 1, unique = TRUE)
if (nReadGroups(SNPhood.o) == 1) {
warning("Cannot delete any read groups because only one read group has been found (", annotationReadGroups(SNPhood.o),"). Returning an unmodified object.")
return(SNPhood.o)
}
maxLen = nReadGroups(SNPhood.o) - 1
if (!testCharacter(readGroups, any.missing = FALSE, min.len = 1, min.chars = 1, max.len = maxLen)) {
warning("Between one and ", maxLen," read groups must be specified. Returning an unmodified object.")
return(SNPhood.o)
}
stopifnot(nReadGroups(SNPhood.o) > length(readGroups))
for (readGroupCur in readGroups) {
SNPhood.o@readCountsUnbinned [[readGroupCur]] = NULL
SNPhood.o@readCountsBinned [[readGroupCur]] = NULL
SNPhood.o@enrichmentBinned [[readGroupCur]] = NULL
SNPhood.o@internal$sizeFactors [[readGroupCur]] = NULL
SNPhood.o@internal$readStartPos [[readGroupCur]] = NULL
SNPhood.o@internal$readWidth [[readGroupCur]] = NULL
}
SNPhood.o@annotation$readGroups = annotationReadGroups(SNPhood.o)[-which(annotationReadGroups(SNPhood.o) %in% readGroups)]
SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking
assertInt(nReadGroups(SNPhood.o), lower = 1)
SNPhood.o
}
# newReadGroupsMapping = list("a", "b", "c")
# names(newReadGroupsMapping) = annotationReadGroups(SNPhood.o)
#' Rename read groups.
#'
#' \code{renameReadGroups} renames a set of read groups from a \emph{SNPhood} object.
#'
#' @template SNPhood
#' @param newReadGroupsMapping Named list. Named list. For clarity of mapping, the names of the list must be the currently defined read group names,
#' and the values of each element the corresponding new ones.
#' @template verbose_TRUE
#' @return an object of class \code{\link{SNPhood}} with the requested read groups being renamed.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' mapping = list("a", "b", "c")
#' names(mapping) = annotationReadGroups(SNPhood.o)
#' SNPhood_mod.o = renameReadGroups (SNPhood.o, mapping)
#' @seealso \code{\link{renameBins}}, \code{\link{renameDatasets}}, \code{\link{renameRegions}}
#' @import checkmate
#' @export
renameReadGroups <- function(SNPhood.o,
newReadGroupsMapping,
verbose = TRUE) {
assertFlag(verbose)
.checkObjectValidity(SNPhood.o, verbose = verbose)
disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
SNPhood.o@internal$disableObjectIntegrityChecking = TRUE
if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
stop(.getErrorForOnlyPrepareSamplesCorrelation())
}
assertList(newReadGroupsMapping, any.missing = FALSE, min.len = 1, max.len = nReadGroups(SNPhood.o))
assertSubset(names(newReadGroupsMapping), annotationReadGroups(SNPhood.o))
assertCharacter(unlist(newReadGroupsMapping), any.missing = FALSE, min.chars = 1, unique = TRUE)
for (i in seq_len(length(newReadGroupsMapping))) {
oldR = names(newReadGroupsMapping)[i]
newR = newReadGroupsMapping[[i]]
if (oldR == newR) next
if (testNull(newR)) {
stop("New name cannot be NULL for read group ", names(newReadGroupsMapping)[i])
}
# If the new name is already present in the object, make sure that the renaming does not cause issues
if (newR %in% annotationReadGroups(SNPhood.o)) {
stop("Cannot rename read group ",oldR, " to ", newR, " because the new name is already present in the object.")
}
names(SNPhood.o@readCountsUnbinned)[which(names(SNPhood.o@readCountsUnbinned) == oldR)] = newR
names(SNPhood.o@readCountsBinned)[which(names(SNPhood.o@readCountsBinned) == oldR)] = newR
names(SNPhood.o@enrichmentBinned)[which(names(SNPhood.o@enrichmentBinned) == oldR)] = newR
names(SNPhood.o@internal$sizeFactors) [which(names(SNPhood.o@internal$sizeFactors) == oldR)] = newR
names(SNPhood.o@internal$readStartPos)[which(names(SNPhood.o@internal$readStartPos) == oldR)] = newR
SNPhood.o@annotation$readGroups[which(SNPhood.o@annotation$readGroups == oldR)] = newR
names(SNPhood.o@annotation$genotype$readsDerived)[which(SNPhood.o@annotation$readGroups == oldR)] = newR
SNPhood.o@additionalResults$allelicBias$parameters$readGroupsTested [which(SNPhood.o@additionalResults$allelicBias$parameters$readGroupsTested == oldR)] = newR
SNPhood.o@additionalResults$allelicBias$parameters$readGroupsDiscarded[which(SNPhood.o@additionalResults$allelicBias$parameters$readGroupsDiscarded == oldR)] = newR
}
SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking
SNPhood.o
}
#' Rename datasets.
#'
#' \code{renameDatasets} renames datasets from a \emph{SNPhood} object.
#'
#' @template SNPhood
#' @param newDatasetsMapping Named list. Named list. For clarity of mapping, the names of the list must be the currently defined dataset names,
#' and the values of each element the corresponding new ones.
#' @template verbose_TRUE
#' @return an object of class \code{\link{SNPhood}} with the requested datasets being renamed.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' mapping = list("Individual1", "Individual2")
#' names(mapping) = annotationDatasets(SNPhood.o)
#' SNPhood_mod.o = renameDatasets(SNPhood.o, mapping)
#' @seealso \code{\link{renameBins}}, \code{\link{renameReadGroups}}, \code{\link{renameRegions}}
#' @export
#' @import checkmate
renameDatasets <- function(SNPhood.o,
newDatasetsMapping,
verbose = TRUE) {
assertFlag(verbose)
.checkObjectValidity(SNPhood.o, verbose = verbose)
disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
SNPhood.o@internal$disableObjectIntegrityChecking = TRUE
assertList(newDatasetsMapping, any.missing = FALSE, min.len = 1, max.len = nDatasets(SNPhood.o))
assertSubset(names(newDatasetsMapping), annotationDatasets(SNPhood.o))
assertCharacter(as.character(newDatasetsMapping), any.missing = FALSE, min.chars = 1, unique = TRUE)
for (i in seq_len(length(newDatasetsMapping))) {
oldR = names(newDatasetsMapping)[i]
newR = newDatasetsMapping[[i]]
if (oldR == newR) next
if (testNull(newR)) {
stop("New name cannot be NULL for dataset ", names(newDatasetsMapping)[i])
}
# If the new name is already present in the object, make sure that the renaming does not cause issues
if (newR %in% annotationDatasets(SNPhood.o)) {
stop("Cannot rename dataset ",oldR, " to ", newR, " because the new name is already present in the object.")
}
# SLOT ANNOTATION
names(SNPhood.o@annotation$files)[which(names(SNPhood.o@annotation$files) == oldR)] = newR
for (alleleCur in annotationReadGroups(SNPhood.o)) {
names(SNPhood.o@annotation$genotype$readsDerived[[alleleCur]])[which(names(SNPhood.o@annotation$genotype$readsDerived[[alleleCur]]) == oldR)] = newR
colnames(SNPhood.o@annotation$genotype$heterozygosity)[which(colnames(SNPhood.o@annotation$genotype$heterozygosity) == oldR)] = newR
names(SNPhood.o@readCountsUnbinned[[alleleCur]])[which(names(SNPhood.o@readCountsUnbinned[[alleleCur]]) == oldR)] = newR
names(SNPhood.o@readCountsBinned[[alleleCur]])[which(names(SNPhood.o@readCountsBinned[[alleleCur]]) == oldR)] = newR
names(SNPhood.o@enrichmentBinned[[alleleCur]])[which(names(SNPhood.o@enrichmentBinned[[alleleCur]]) == oldR)] = newR
names(SNPhood.o@internal$sizeFactors[[alleleCur]])[which(names(SNPhood.o@internal$sizeFactors[[alleleCur]]) == oldR)] = newR
names(SNPhood.o@internal$readStartPos[[alleleCur]])[which(names(SNPhood.o@internal$readStartPos[[alleleCur]]) == oldR)] = newR
names(SNPhood.o@internal$readWidth[[alleleCur]])[which(names(SNPhood.o@internal$readWidth[[alleleCur]]) == oldR)] = newR
}
if (!testNull(SNPhood.o@additionalResults$allelicBias)) {
names(SNPhood.o@additionalResults$allelicBias$pValue)[which(names(SNPhood.o@additionalResults$allelicBias$pValue) == oldR)] = newR
names(SNPhood.o@additionalResults$allelicBias$confIntervalMin)[which(names(SNPhood.o@additionalResults$allelicBias$confIntervalMin) == oldR)] = newR
names(SNPhood.o@additionalResults$allelicBias$confIntervalMax)[which(names(SNPhood.o@additionalResults$allelicBias$confIntervalMax) == oldR)] = newR
names(SNPhood.o@additionalResults$allelicBias$fractionEstimate)[which(names(SNPhood.o@additionalResults$allelicBias$fractionEstimate) == oldR)] = newR
names(SNPhood.o@additionalResults$allelicBias$background)[which(names(SNPhood.o@additionalResults$allelicBias$background) == oldR)] = newR
#TODO: dataset names
# TODO: change implementation: just have the names appear once and use [[1]] and [[2]] or so for it throughout the object
}
}
SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking
SNPhood.o
}
#' Rename bins.
#'
#' \code{renameBins} renames bins from a \emph{SNPhood} object.
#'
#' @template SNPhood
#' @param newBinsMapping Named list. For clarity of mapping, the names of the list must be the currently defined bin names,
#' and the values of each element the corresponding new ones.
#' @template verbose_TRUE
#' @return an object of class \code{\link{SNPhood}} with the requested bins being renamed.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' mapping = list("Bin1_NEW")
#' names(mapping) = annotationBins(SNPhood.o)[1]
#' SNPhood_mod.o = renameBins(SNPhood.o, mapping)
#' @seealso \code{\link{deleteDatasets}}, \code{\link{deleteRegions}}
#' @export
#' @import checkmate
renameBins <- function(SNPhood.o,
newBinsMapping,
verbose = TRUE) {
.checkObjectValidity(SNPhood.o, verbose = verbose)
disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
SNPhood.o@internal$disableObjectIntegrityChecking = TRUE
if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
stop(.getErrorForOnlyPrepareSamplesCorrelation())
}
assertList(newBinsMapping, any.missing = FALSE, min.len = 1, max.len = nBins(SNPhood.o))
assertSubset(names(newBinsMapping), annotationBins(SNPhood.o))
assertCharacter(unlist(newBinsMapping), any.missing = FALSE, min.chars = 1, unique = TRUE)
for (i in seq_len(length(newBinsMapping))) {
oldR = names(newBinsMapping)[i]
newR = newBinsMapping[[i]]
if (oldR == newR) next
if (testNull(newR)) {
stop("New name cannot be NULL for bin ", names(newBinsMapping)[i])
}
# If the new name is already present in the object, make sure that the renaming does not cause issues
if (newR %in% annotationBins(SNPhood.o)) {
stop("Cannot rename bin ",oldR, " to ", newR, " because the new name is already present in the object.")
}
SNPhood.o@annotation$bins[which(SNPhood.o@annotation$bins == oldR)] = newR
}
SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking
SNPhood.o
}
#' Rename regions.
#'
#' \code{renameRegions} renames regions from a \emph{SNPhood} object.
#'
#' @template SNPhood
#' @param newRegionsMapping Named list. For clarity of mapping, the names of the list must be the currently defined region names,
#' and the values of each element the corresponding new ones.
#' @template verbose_TRUE
#' @return An object of class \code{\link{SNPhood}} with the requested regions being renamed
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' mapping = as.list(paste0(annotationRegions(SNPhood.o),".newName"))
#' names(mapping) = annotationRegions(SNPhood.o)
#' SNPhood_mod.o = renameRegions(SNPhood.o, mapping)
#' @seealso \code{\link{renameBins}}, \code{\link{renameDatasets}}, \code{\link{renameReadGroups}}
#' @export
#' @import checkmate
#' @import GenomicRanges
# @importFrom GenomicRanges mcols<-
renameRegions <- function(SNPhood.o,
newRegionsMapping,
verbose = TRUE) {
.checkObjectValidity(SNPhood.o, verbose = verbose)
disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
SNPhood.o@internal$disableObjectIntegrityChecking = TRUE
assertList(newRegionsMapping, any.missing = FALSE, min.len = 1, max.len = nRegions(SNPhood.o))
assertSubset(names(newRegionsMapping), annotationRegions(SNPhood.o))
assertCharacter(unlist(newRegionsMapping), any.missing = FALSE, min.chars = 1, unique = TRUE)
for (i in seq_len(length(newRegionsMapping))) {
oldR = names(newRegionsMapping)[i]
newR = newRegionsMapping[[i]]
if (oldR == newR) next
if (testNull(newR)) {
stop("New name cannot be NULL for bin ", names(newRegionsMapping)[i])
}
# If the new name is already present in the object, make sure that the renaming does not cause issues
if (newR %in% annotationRegions(SNPhood.o)) {
stop("Cannot rename region ",oldR, " to ", newR, " because the new name is already present in the object.")
}
mcols(SNPhood.o@annotation$regions)$annotation[which(names(SNPhood.o@annotation$regions) == oldR)] = newR
names(SNPhood.o@annotation$regions) [which(names(SNPhood.o@annotation$regions) == oldR)] = newR
}
SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking
SNPhood.o
}
#' Delete a set of user regions from a \emph{SNPhood} object.
#'
#' \code{deleteRegions} deletes a set of user regions.
#' Removal is irreversible. It is therefore recommended to save the resulting \code{\link{SNPhood}} object with a new name.
#'
#'
#' @template SNPhood
#' @template regions
#' @template verbose_TRUE
#' @return an object of class \code{\link{SNPhood}} with the requested regions being deleted.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' # Delete the first 10 regions
#' SNPhood_mod.o = deleteRegions(SNPhood.o, c(1:10))
#'
#' # Delete regions by their annotation
#' SNPhood_mod.o = deleteRegions(SNPhood.o, c("rs2822405", "rs467140"))
#' @section Warning:
#' \strong{Execution of this function resets the slot additionalResults and all of its results (e.g, allelic bias analysis).
#' The reason for this is that all results stored in this slot are affected by the deletion of regions}
#' @seealso \code{\link{deleteDatasets}}, \code{\link{deleteReadGroups}}
#' @export
#' @import checkmate
deleteRegions <- function(SNPhood.o,
regions,
verbose = TRUE) {
# Check types and validity of arguments
assertFlag(verbose)
.checkObjectValidity(SNPhood.o, verbose = verbose)
disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
SNPhood.o@internal$disableObjectIntegrityChecking = TRUE
if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
stop(.getErrorForOnlyPrepareSamplesCorrelation())
}
regions = .checkAndConvertRegionArgument(SNPhood.o, regions, nullAllowed = FALSE, maxLength = nRegions(SNPhood.o) - 1)
# SLOT annotation
SNPhood.o@annotation$regions = SNPhood.o@annotation$regions[-regions,]
# SLOT config: Nothing to do
# SLOT readCountsUnbinned, readCountsBinned, enrichmentBinned
for (alleleCur in annotationReadGroups(SNPhood.o)) {
for (fileCur in annotationDatasets(SNPhood.o)) {
SNPhood.o@readCountsUnbinned[[alleleCur]] [[fileCur]] = SNPhood.o@readCountsUnbinned[[alleleCur]] [[fileCur]][-regions]
SNPhood.o@readCountsBinned[[alleleCur]] [[fileCur]] = SNPhood.o@readCountsBinned[[alleleCur]] [[fileCur]][-regions,]
if (parameters(SNPhood.o)$normByInput) {
SNPhood.o@enrichmentBinned[[alleleCur]] [[fileCur]] = SNPhood.o@enrichmentBinned[[alleleCur]] [[fileCur]][-regions,]
}
# SLOT internal
if (!testNull(SNPhood.o@internal$readStartPos[[alleleCur]] [[fileCur]])) {
SNPhood.o@internal$readStartPos[[alleleCur]] [[fileCur]][regions] = NULL
SNPhood.o@internal$readWidth[[alleleCur]] [[fileCur]][regions] = NULL
}
}
}
# SLOT additionalResults
# Reset that, because all slots are affected by the deletion of regions
SNPhood.o@additionalResults = list()
SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking
SNPhood.o
}
#' Delete a particular set of datasets from a \emph{SNPhood} object.
#'
#' \code{deleteDatasets} deletes a particular set of datasets from a \code{\link{SNPhood}} object.
#' Removal is irreversible. It is therefore recommended to save the resulting \code{\link{SNPhood}} object with a new name because the deleted datasets cannot be recovered.
#'
#' @template SNPhood
#' @template datasets
#' @template verbose_TRUE
#' @return an object of class \code{\link{SNPhood}} with the requested datasets removed from all slots.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' SNPhood_mod.o = deleteDatasets(SNPhood.o, c(1,2))
#' @seealso \code{\link{deleteRegions}}, \code{\link{deleteReadGroups}}
#' @export
#' @import checkmate
deleteDatasets <- function(SNPhood.o,
datasets = NULL,
verbose = TRUE) {
# Check types and validity of arguments
assertFlag(verbose)
.checkObjectValidity(SNPhood.o, verbose = verbose)
disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
SNPhood.o@internal$disableObjectIntegrityChecking = TRUE
if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
stop(.getErrorForOnlyPrepareSamplesCorrelation())
}
assert(checkNull(datasets),
checkIntegerish(datasets, lower = 1, upper = nBins(SNPhood.o), unique = TRUE, min.len = 1, max.len = nBins(SNPhood.o) - 1),
checkCharacter(datasets, min.len = 1, max.len = nDatasets(SNPhood.o) - 1, unique = TRUE)
)
if (testCharacter(datasets)) {
assertSubset(datasets, annotationDatasets(SNPhood.o))
}
if (testNull(datasets)) {
warning("No datasets have been provided for deletion. Object will remain unchanged.")
return(SNPhood.o)
}
# Convert integer to names
if (testIntegerish(datasets, lower = 1, upper = nBins(SNPhood.o), unique = TRUE, min.len = 1, max.len = nBins(SNPhood.o) - 1)) {
datasets = annotationDatasets(SNPhood.o)[datasets]
}
# SLOT annotation
SNPhood.o@annotation$files = annotation(SNPhood.o)$files[-which(names(annotation(SNPhood.o)$files) %in% datasets)]
# Slot config: Nothing to do
# SLOT enrichmentBinned, readCountsUnbinned, and readCountsBinned, additionalResults
for (alleleCur in annotationReadGroups(SNPhood.o)) {
for (fileCur in datasets) {
if (parameters(SNPhood.o)$normByInput) {
SNPhood.o@enrichmentBinned[[alleleCur]] [[fileCur]] = NULL
}
SNPhood.o@annotation$genotype$readsDerived[[alleleCur]][[fileCur]] = NULL
SNPhood.o@readCountsUnbinned[[alleleCur]] [[fileCur]] = NULL
SNPhood.o@readCountsBinned [[alleleCur]] [[fileCur]] = NULL
if (!testNull(SNPhood.o@additionalResults$allelicBias)) {
SNPhood.o@additionalResults$allelicBias$pValue[fileCur] = NULL
SNPhood.o@additionalResults$allelicBias$confIntervalMin[fileCur] = NULL
SNPhood.o@additionalResults$allelicBias$confIntervalMax[fileCur] = NULL
SNPhood.o@additionalResults$allelicBias$fractionEstimate[fileCur] = NULL
SNPhood.o@additionalResults$allelicBias$background[fileCur] = NULL
}
# Delete clustering results
if (!testNull(SNPhood.o@additionalResults$clustering$enrichmentBinned[[alleleCur]][[fileCur]])) {
SNPhood.o@additionalResults$clustering[[alleleCur]][[fileCur]] = NULL
SNPhood.o@additionalResults$clustering[[alleleCur]][[fileCur]] = NULL
}
# SLOT INTERNAL
if (!testNull(SNPhood.o@internal$readStartPos[[alleleCur]][[fileCur]])) {
SNPhood.o@internal$readStartPos[[alleleCur]][[fileCur]] = NULL
SNPhood.o@internal$readWidth[[alleleCur]][[fileCur]] = NULL
}
}
}
SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking
SNPhood.o
}
#' Get the annotation(names) of bins in a \emph{SNPhood} object.
#'
#' \code{annotationBins2} is a helper function that returns annotation of the bins that are defined in the \code{\linkS4class{SNPhood}} object.
#'
#' @template SNPhood
#' @param regions Integer or character. Default NULL. A subset of the SNP regions for which annotation is needed. Either the row numbers or the rownames(IDs) of the SNP regions are supported.
#' @param fullAnnotation Logical(1). Should the full annotation(as a data.frame) be returned or only the annotation of the bins(as a character vector)?
#' @template verbose_TRUE
#' @return If \code{fullAnnotation} is set to \code{TRUE}, a data.frame with the full annotation of the bins for the(subset of) SNP regions is returned. Otherwise, a character vector with only the annotation of the bins is returned.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' annotation.df = annotationBins2(SNPhood.o, regions = 1:10, fullAnnotation = TRUE)
#' annotation.vec = annotationBins2(SNPhood.o, regions = 1:10, fullAnnotation = FALSE)
#' @section Warning:
#' \strong{The number of returned bins can easily be very large, in the order of millions.
#' Be careful because the memory consumption due the resulting object may increase considerably.}
#' Reduce memory requirements by returning only a subset of SNP regions
#' @export
#' @import checkmate
#' @import GenomicRanges
# @importFrom GenomicRanges GRanges mcols
annotationBins2 <- function(SNPhood.o,
regions = NULL,
fullAnnotation = FALSE,
verbose = TRUE) {
# Check types and validity of arguments
assertFlag(verbose)
.checkObjectValidity(SNPhood.o, verbose = verbose)
disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
SNPhood.o@internal$disableObjectIntegrityChecking = TRUE
if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
stop(.getErrorForOnlyPrepareSamplesCorrelation())
}
regions = .checkAndConvertRegionArgument(SNPhood.o, regions, nullAllowed = TRUE, maxLength = NULL)
assertFlag(fullAnnotation)
assertFlag(verbose)
# Grep the rows that the user wants
if (!testNull(regions)) {
userRegions.red.gr = SNPhood.o@annotation$regions[regions]
} else {
userRegions.red.gr = SNPhood.o@annotation$regions
}
res.l = .createBins(userRegions.red.gr, parameters(SNPhood.o), applyLastBinStrategy = TRUE, .getGenomeData(parameters(SNPhood.o)$assemblyVersion), verbose = TRUE)
binsPerRegion.vec = table(res.l$bins$origRegionIndex)
nBinsPerRegion = unique(binsPerRegion.vec)
stopifnot( length(nBinsPerRegion) == 1)
SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking
# Create a general file format out of the list and call transformation helper function afterwards
if (fullAnnotation) {
bins.df = data.frame(chr = res.l$bins$chr,
start = res.l$bins$start,
end = res.l$bins$end,
annotationBin = paste0(mcols(userRegions.red.gr)$annotation[res.l$bins$origRegionIndex],"_bin", rep(1:nBinsPerRegion, length(userRegions.red.gr))),
annotationRegion = res.l$bins$origRegionIndex,
length = res.l$bins$length,
strand = res.l$bins$strand,
stringsAsFactors = TRUE
)
#
#
# if (!is.null(addScoreColumn)) {
#
# #Create the subset of rows that contain our regions
# includeScoreColumn.red.m = addScoreColumn [which(rownames(addScoreColumn) %in% mcols(userRegions.red.gr)$annotation[unique(res.l$bins$origRegionIndex)]),]
#
# assertInt(nrow(includeScoreColumn.red.m), lower=1)
#
# # Put the elements of the matrix into the correct order and into the bins.df dataframe(score column)
#
# #Check if the order is indeed identical
# identical(rownames(includeScoreColumn.red.m), mcols(userRegions.red.gr)$annotation)
#
# bins.df$score = as.vector(t(includeScoreColumn.red.m))
#
# }
return(bins.df)
} else {
return(paste0(mcols(userRegions.red.gr)$annotation[res.l$bins$origRegionIndex],"_bin", rep(1:nBinsPerRegion, length(userRegions.red.gr))))
}
}
#' Convert read counts across read groups to relative fractions from a \emph{SNPhood} object.
#'
#' \code{convertToAllelicFractions} convert read counts across read groups to their relative fractions among all
#' read groups (all read counts will be between 0 and 1, with 1 for a particular read group depicting that all reads
#' from this particular position originate from the one read group)
#' Affected slots are \code{readCountsUnbinned} and \code{readCountsBinned}.
#' It is recommended to save the resulting \code{\link{SNPhood}} object with a new name because it is not possible to go back from
#' fractions to read counts at a later point.
#'
#' @template SNPhood
#' @param roundDigits Numeric(1). Default 2. Number of digits after the decimal place when converting read counts to fractions
#' @param setNaNToZero Logical(1). Default FALSE. Should NaN (not a number) be converted to 0? NaN may result from individual regions or
#' bins with no reads across all read groups due to a division by zero.
#' @template verbose_TRUE
#' @return an object of class \code{\link{SNPhood}} with read counts across read groups (both for the slots readCountsUnbinned and
#' readCountsBinned) replaced by their respective relative fractions. Otherwise identical to the input \code{\link{SNPhood}} object.
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' SNPhood_allelicFractions.o = convertToAllelicFractions(SNPhood.o)
#'
#' # Convert all NaN to 0 for subsequent analyses
#' SNPhood_allelicFractions.o = convertToAllelicFractions(SNPhood.o, setNaNToZero = TRUE)
#'
#' @seealso \code{\link{deleteReadGroups}}
#' @export
#' @import checkmate
convertToAllelicFractions <- function(SNPhood.o,
roundDigits = 2,
setNaNToZero = FALSE,
verbose = TRUE) {
# Check types and validity of arguments
assertFlag(verbose)
.checkObjectValidity(SNPhood.o, verbose = verbose)
disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
SNPhood.o@internal$disableObjectIntegrityChecking = TRUE
if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
stop(.getErrorForOnlyPrepareSamplesCorrelation())
}
assertInt(roundDigits, lower = 0)
assertFlag(setNaNToZero)
if (SNPhood.o@internal$isAllelicRatio) {
warning("Allelic ratios have already been computed. Object will remain unchanged.")
return(SNPhood.o)
}
if (SNPhood.o@internal$countType == "enrichment") {
warning("Cannot calculate allelic fractions for enrichments.")
return(SNPhood.o)
}
if (nReadGroups(SNPhood.o) == 1) {
warning("Only one read group has been found (", annotationReadGroups(SNPhood.o),"), cannot discard anything.")
return(SNPhood.o)
}
nDatasetsCur = nDatasets(SNPhood.o)
nReadGroupsCur = nReadGroups(SNPhood.o)
for (i in seq_len(nDatasetsCur)) {
# Calculate the sum of the number of reads per bin
sumBins.m = 0
sumRegions.vec = 0
for (j in 1:nReadGroupsCur) {
sumBins.m = sumBins.m + SNPhood.o@readCountsBinned [[j]] [[i]]
sumRegions.vec = sumRegions.vec + SNPhood.o@readCountsUnbinned[[j]] [[i]]
}
# Calculate the fraction
for (j in 1:nReadGroupsCur) {
SNPhood.o@readCountsUnbinned[[j]] [[i]] = round(SNPhood.o@readCountsUnbinned [[j]] [[i]] / sumRegions.vec, digits = roundDigits)
SNPhood.o@readCountsBinned [[j]] [[i]] = round(SNPhood.o@readCountsBinned [[j]] [[i]] / sumBins.m, digits = roundDigits)
if (setNaNToZero) {
SNPhood.o@readCountsUnbinned[[j]] [[i]][is.nan(SNPhood.o@readCountsUnbinned[[j]] [[i]])] = 0
SNPhood.o@readCountsBinned [[j]] [[i]][is.nan(SNPhood.o@readCountsBinned [[j]] [[i]])] = 0
}
}
}
SNPhood.o@internal$isAllelicRatio = TRUE
SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking
SNPhood.o
}
#' Associate genotypes with user regions from a \emph{SNPhood} object.
#'
#' The function \code{associateGenotypes} associates genotypes with SNP regions as defined in a \code{\linkS4class{SNPhood}} object. It is possible to assign
#' genotypes only for a subset of datasets as defined in a \code{\linkS4class{SNPhood}} object.
#' To avoid any ambiguities, a 1:1 for genotype and dataset mapping must be given (ses below).
#' @template SNPhood
#' @param genotypeMapping Data frame. A data frame that establishes the mapping between datasets in the object and the corresponding genotype file and
#' column names. See the examples.
#' must be provided. See the Vignette for a more detailed description of the supported file format.
#' @template verbose_TRUE
#' @return Object of class \code{\linkS4class{SNPhood}} with the genotype information added to the slot \code{annotation}, element genotype.
#' You may retrieve it via the accessor function \code{\link{annotation}}.
#' @export
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' fileGenotypes = list.files(pattern = "*genotypes*",system.file("extdata", package = "SNPhoodData"), full.names = TRUE)
#' mapping = data.frame(samples = annotationDatasets(SNPhood.o), genotypeFile = rep(fileGenotypes, 2), sampleName = c("NA10847", "NA12890"))
#' SNPhood.o = associateGenotypes(SNPhood.o, mapping)
#' @import checkmate
associateGenotypes <- function(SNPhood.o,
genotypeMapping,
verbose = TRUE) {
assertFlag(verbose)
.checkObjectValidity(SNPhood.o, verbose = verbose)
disableIntegrityChecking = SNPhood.o@internal$disableObjectIntegrityChecking
SNPhood.o@internal$disableObjectIntegrityChecking = TRUE
if (SNPhood.o@config$onlyPrepareForDatasetCorrelation) {
stop(.getErrorForOnlyPrepareSamplesCorrelation())
}
assertDataFrame(genotypeMapping, types = c("character", "factor"), any.missing = FALSE, min.rows = 1, min.cols = 3)
#assertSubset(genotypeMapping$samples, annotationDatasets(SNPhood.o), empty.ok = FALSE)
assertFlag(verbose)
genotypeMapping[] <- lapply(genotypeMapping, as.character)
assertSubset(c("samples", "genotypeFile", "sampleName"), colnames(genotypeMapping))
assertCharacter(genotypeMapping$samples, unique = TRUE)
# Init the data frame that stores the external genotypes
res.df = data.frame(alleleRef = rep(NA, nRegions(SNPhood.o)),
alleleAlt = NA,
genotypeMismatch = FALSE)
SNPhood.o@annotation$genotype$external = res.df
# Delete NA in the list if there are any
genotypeMapping = genotypeMapping[!is.na(genotypeMapping$samples),]
# Check if the files all exist and are readable
sapply(unique(genotypeMapping$genotypeFile), FUN = function(x) assertFileExists(x, access = "r"))
# Parse VCF files
for (file_vcf in unique(genotypeMapping$genotypeFile)) {
if (verbose) message("\n Import genotypes from file ", file_vcf,". This may take a while...\n")
# Get a list of all sample names that we have to extract from the file
sampleNamesFile = genotypeMapping$sampleName[which(genotypeMapping$genotypeFile == file_vcf)]
sampleNamesObject = genotypeMapping$samples [which(genotypeMapping$genotypeFile == file_vcf)]
SNPhood.o = .extractGenotypesVCF(SNPhood.o, file_vcf, sampleNamesObject, sampleNamesFile, yieldSize = NULL, verbose = verbose)
}
if (verbose) message("The individual genotypes have been added to the annotation.")
# Add genotype information to annotation
for (individualCur in annotationDatasets(SNPhood.o)) {
# If not defined, set it to NA
index = which(genotypeMapping$samples == individualCur)
if (length(index) == 0) {
SNPhood.o@annotation$files[[individualCur]]$genotypeFile = NA
SNPhood.o@annotation$files[[individualCur]]$genotypeFileSampleName = NA
}
SNPhood.o@annotation$files[[individualCur]]$genotypeFile = genotypeMapping$genotypeFile[index]
SNPhood.o@annotation$files[[individualCur]]$genotypeFileSampleName = genotypeMapping$sampleName [index]
}
SNPhood.o = .identifyGenotypeIncompatibilities(SNPhood.o, verbose = verbose)
SNPhood.o@internal$disableObjectIntegrityChecking = disableIntegrityChecking
return(SNPhood.o)
}
#' Disable object integrity checking for a \emph{SNPhood} object.
#'
#' The function \code{changeObjectIntegrityChecking} disables object integrity checking for a \emph{SNPhood} object.
#' This might be desired for large objects when the integrity test takes too much time.
#' Note, however, that disabling these checks is not recommended.
#' @template SNPhood
#' @param disable Logical(1). Default FALSE. Disable the object integrity checking?
#' @template verbose_TRUE
#' @return Object of class \code{\linkS4class{SNPhood}} with object integrity checking disabled.
#' @export
#' @examples
#' data(SNPhood.o, package="SNPhood")
#' SNPhood.o = changeObjectIntegrityChecking(SNPhood.o, disable = TRUE)
#' @import checkmate
changeObjectIntegrityChecking <- function(SNPhood.o,
disable = FALSE,
verbose = TRUE) {
assertFlag(verbose)
.checkObjectValidity(SNPhood.o, verbose = verbose)
SNPhood.o@internal$disableObjectIntegrityChecking = disable
if (disable) {
message("Object integrity checking disabled for the specific object.")
} else {
message("Object integrity checking enabled for the specific object.")
}
SNPhood.o
}
#' @import checkmate
# TODO: Change object copying
.identifyGenotypeIncompatibilities <- function(SNPhood.o,
verbose = TRUE) {
nMismatchesTotal = 0
mismatches.l = list()
# Do some quality checking and set all genotypes to NA if the information from the VCF file and the read-derived files differ
for (fileCur in annotationDatasets(SNPhood.o)) {
if (is.na(annotation(SNPhood.o)$files[[fileCur]]$genotypeFile) | is.na(annotation(SNPhood.o)$files[[fileCur]]$genotypeFileSampleName)) {
next
}
if (verbose) message(" Check for genotype mismatches for file ",fileCur)
nMismatchesFile = 0
mismatches.l[[fileCur]] = c()
for (regionCur in 1:nRegions(SNPhood.o)) {
# Sum up all genotypes across read groups
for (alleleCur in 1:nReadGroups(SNPhood.o)) {
dist = annotation(SNPhood.o)$genotype$readsDerived[[annotationReadGroups(SNPhood.o)[alleleCur]]][[fileCur]][, regionCur]
if (alleleCur == 1) {
readDerived = dist
} else {
readDerived = readDerived + dist
}
}
readDerivedPresent = as.character(names(readDerived[readDerived > 0]))
if (length(readDerivedPresent) == 0) {
next
}
# Compare with information from VCF file
readPossVCF = as.character(annotation(SNPhood.o)$genotype$external[regionCur, c("alleleRef","alleleAlt")])
if (all(is.na(readPossVCF)) | all(is.na(readDerivedPresent))) {
next
}
if (length(intersect(readDerivedPresent, readPossVCF)) == 0) {
nMismatchesFile = nMismatchesFile + 1
nMismatchesTotal = nMismatchesTotal + 1
mismatches.l[[fileCur]] = c(mismatches.l[[fileCur]], regionCur)
# Flag region
SNPhood.o@annotation$genotype$external$genotypeMismatch[regionCur] = TRUE
}
}
if (verbose) message("Found ",nMismatchesFile, " mismatches between the genotypes from the BAM file reads and the VCF information for sample ",fileCur)
}
if (verbose) message("Found ",nMismatchesTotal, " mismatches between the genotypes from the BAM file reads and the VCF information across all files")
SNPhood.o@annotation$genotype$mismatches = mismatches.l
return(SNPhood.o)
}
#' @import checkmate
#' @importFrom GenomeInfoDb seqlevels seqlevels<-
#' @importFrom VariantAnnotation scanVcfHeader ScanVcfParam readVcf geno alt ref samples
#' @import Rsamtools
# @importFrom Rsamtools bgzip indexTabix TabixFile
#' @import GenomicRanges
#' @importFrom SummarizedExperiment rowRanges
# #' @importFrom Biostrings unlist
# TODO: remove the object as argument to prevent copying
.extractGenotypesVCF <- function(SNPhood.o,
file_vcf,
sampleNamesObject,
sampleNamesFile,
yieldSize = NULL,
verbose = TRUE) {
# Check types and validity of arguments
assertFileExists(file_vcf, access = "r")
assertCharacter(sampleNamesObject, any.missing = FALSE, min.len = 1, max.len = nDatasets(SNPhood.o))
assertCharacter(sampleNamesFile , any.missing = FALSE, min.len = 1, max.len = nDatasets(SNPhood.o))
assert(checkNull(yieldSize),
checkIntegerish(yieldSize, lower = 100, len = 1)
)
if (is.null(yieldSize)) {
yieldSizePar = NA_integer_
} else {
yieldSizePar = yieldSize
}
genotypes.l = list()
# Either an existing uncompressed or gz-compressed file
compressVcf <- bgzip(file_vcf, tempfile())
idx <- indexTabix(compressVcf, "vcf")
tab <- TabixFile(compressVcf, index = idx, yieldSize = yieldSizePar)
# Scan the header first and get a list of sample names, and only parse those that intersect with the names in the SNPhood object
hdr = scanVcfHeader(file_vcf)
samplesToExtract = intersect(samples(hdr), unique(sampleNamesFile))
samplesMissing = setdiff(unique(sampleNamesFile), samples(hdr))
if (length(samplesToExtract) == 0) {
stop("No common sample names found between the provided sample names for the SNPhood samples (",unique(sampleNamesFile),") and the samples defined in the file ",
file_vcf, "(", paste0(samples(hdr),collapse = ","),")")
}
if (length(samplesToExtract) < length(unique(sampleNamesFile))) {
warning("Could not extract genotypes for all samples. The following samples were not found in the file ", file_vcf, ": ",paste0(samplesMissing, collapse = ","))
}
if (verbose) message(" Common samples found in file ", file_vcf,". Extract genotypes for samples ",paste0(samplesToExtract, collapse = ","),".")
# Replace "chr" to make sure that the chromosome names are identical by parsing the first lines and check
# TODO: make this more general and determine if necessary automatically
SNPs.gr = .getSNPGRangesObj(annotation(SNPhood.o))
seqlevels(SNPs.gr) <- sub("chr", "", seqlevels(SNPs.gr))
# Make the parsing as fast as possible. Only read in the GT field (genotype), and skip all info fields
param = ScanVcfParam(
fixed = c("ALT"),
info = NA,
geno = "GT",
samples = samplesToExtract,
trimEmpty = TRUE,
which = SNPs.gr
)
# Parse VCF file
vcf <- readVcf(tab, genome = parameters(SNPhood.o)$assemblyVersion, param = param)
if (length(rowRanges(vcf)) == 0) {
warning("Could not find genotype information for any of the SNPs.")
return(SNPhood.o)
}
if (length(geno(vcf)) == 0 | !"GT" %in% names(geno(vcf))) {
stop("Could not find or extract genotype information from the file ",file_vcf,". Does the file comply to the VCF standard and contains the required header lines?")
}
if (length(which(!mcols(rowRanges(vcf))$paramRangeID %in% names(SNPs.gr)) == FALSE) > 0) {
warning("Note: At least one SNP ID in the VCF file differs from the SNP ID in the SNPhood object despite identical positions.")
}
if (verbose) message(" Found and imported ",length(rowRanges(vcf))," SNPs with genotype information across all samples out of all ",nRegions(SNPhood.o), " SNPs as defined in the SNPhood object.")
# How does it work with yieldSize and the loop, use it at all or not necessary?
# open(tab)
# while (nrow(vcf_yield <- readVcf(tab, genome=parameters(SNPhood.o)$assemblyVersion, param=param)))
# cat("vcf dim:", dim(vcf_yield), "\n")
# close(tab)
assertInteger(length(rowRanges(vcf)), lower = 1, len = 1, any.missing = FALSE)
# Get a mapping of which SNP the entries correspond to
mapping = sapply(seq_len(length(rowRanges(vcf))), function(x) {which(start(rowRanges(vcf))[x] == start(SNPs.gr))})
ref.vec = rep(NA, length(ref(vcf)))
ref.vec[mapping] = as.character(ref(vcf))
not.na.index = which(!is.na(ref.vec))
# If the REF allele is NA in the object, it has not yet been assigned.
indexInit = which(is.na(SNPhood.o@annotation$genotype$external$alleleRef[mapping]))
SNPhood.o@annotation$genotype$external$alleleRef[mapping[indexInit]] = ref.vec[mapping][indexInit]
indexToTest = intersect(which(!is.na(SNPhood.o@annotation$genotype$external$alleleRef)), not.na.index)
if (length(indexToTest) > 0) {
indexMismatch = which(ref.vec[indexToTest] != SNPhood.o@annotation$genotype$external$alleleRef[indexToTest])
# Genotype mismatch found, set to NA and mark as mismatch
if (length(indexMismatch) > 0) {
SNPhood.o@annotation$genotype$external$genotypeMismatch[indexToTest[indexMismatch]] = TRUE
SNPhood.o@annotation$genotype$external$alleleRef[indexToTest[indexMismatch]] = NA
warning("For ", length(indexMismatch), " SNPs (",paste0(rownames(SNPhood.o@annotation$genotype$external)[indexMismatch],collapse = ","), "), a genotype mismatch for the reference allele was found as compared to previously parsed VCF files. The reference allele has been set to NA for these SNPs, and they have been marked in the column genotypeMismatch")
}
}
alt.vec = rep(NA, length(alt(vcf)))
alt.vec [mapping] = as.character(unlist(sapply(seq_len(length(alt(vcf))), function(x) {paste0(as.character(unlist(alt(vcf)[x])), collapse = ",")})))
alt.vec = unlist(alt.vec)
not.na.index = which(!is.na(alt.vec))
# If the ALL allele is NA in the object, it has not yet been assigned.
indexInit = which(is.na(SNPhood.o@annotation$genotype$external$alleleAlt[mapping]))
SNPhood.o@annotation$genotype$external$alleleAlt[mapping[indexInit]] = alt.vec[mapping][indexInit]
indexToTest = intersect(which(!is.na(SNPhood.o@annotation$genotype$external$alleleAlt)), not.na.index)
if (length(indexToTest) > 0) {
# Get a list of all ALT alleles per SNP
alleles.alt.l = strsplit(SNPhood.o@annotation$genotype$external$alleleAlt[indexToTest],",")
# Append the new ones and make them unique.
for (i in seq_len(length(indexToTest))) {
alleles.alt.l[[i]] = unique(c( alleles.alt.l[[i]], unlist(strsplit(alt.vec[indexToTest[i]],",")) ))
}
# Convert back
SNPhood.o@annotation$genotype$external$alleleAlt[indexToTest] = sapply(alleles.alt.l,paste0,collapse = ",")
}
assertSubset(SNPhood.o@annotation$genotype$external$alleleRef, c("A","C","G","T",NA))
assertSubset(SNPhood.o@annotation$genotype$external$alleleAlt, c("A","C","G","T",NA))
for (i in seq_len(length(samplesToExtract))) {
colIndex = which(colnames(geno(vcf)$GT) == samplesToExtract[i])
colname = paste0(file_vcf, ":", samplesToExtract[i])
assertIntegerish(colIndex, lower = 1, upper = length(colnames(geno(vcf)$GT)), any.missing = FALSE)
# Add a new column, initially all NA
SNPhood.o@annotation$genotype$external[,colname] = NA
# Only change for the SNPs we have genotypes for
SNPhood.o@annotation$genotype$external[mapping, colname] = geno(vcf)$GT[, colIndex]
# Set to NA if "."
SNPhood.o@annotation$genotype$external[which(SNPhood.o@annotation$genotype$external[colname] == "."), colname] = NA
}
# TODO: genotypes.l
SNPhood.o
}
#' @import checkmate
.checkAndConvertRegionArgument <- function(SNPhood.o,
regions,
nullAllowed = TRUE,
maxLength = NULL) {
if (testNull(maxLength)) maxLength = nRegions(SNPhood.o)
minLength = ifelse(nullAllowed, 0, 1)
assert(checkNull(regions),
checkCharacter(regions, any.missing = FALSE, min.chars = 1, min.len = minLength, max.len = maxLength, unique = TRUE),
checkIntegerish(regions, lower = 1, upper = nRegions(SNPhood.o),
min.len = minLength, max.len = maxLength, unique = TRUE)
)
if (!nullAllowed) {
assert(!testNull(regions))
}
# Check if names or indizes are provided, and convert to row numbers
if (testCharacter(regions)) {
assertSubset(regions, annotationRegions(SNPhood.o))
regions = which(annotationRegions(SNPhood.o) %in% regions)
}
regions
}
#' @import checkmate
.checkAndConvertReadGroupArgument <- function(SNPhood.o,
readGroup) {
if (nReadGroups(SNPhood.o) == 1) {
assert(checkNull(readGroup),
checkChoice(readGroup, annotationReadGroups(SNPhood.o)))
if (testNull(readGroup)) {
readGroup = annotationReadGroups(SNPhood.o)
}
} else {
assertChoice(readGroup, annotationReadGroups(SNPhood.o))
}
readGroup
}
#' @import checkmate
.checkAndConvertDatasetArgument <- function(SNPhood.o,
datasets,
nullAllowed = TRUE,
maxLength = NULL,
returnNames = TRUE) {
if (testNull(maxLength)) maxLength = nDatasets(SNPhood.o)
minLength = ifelse(nullAllowed, 0, 1)
assert(checkNull(datasets),
checkCharacter(datasets, any.missing = FALSE, min.chars = 1, min.len = minLength, max.len = maxLength, unique = TRUE),
checkIntegerish(datasets, lower = 1, upper = nDatasets(SNPhood.o),
min.len = minLength, max.len = maxLength, unique = TRUE)
)
if (!nullAllowed) {
assert(!testNull(datasets))
}
# Check if names or indizes are provided, and convert to row numbers
if (testCharacter(datasets)) {
checkSubset(datasets, annotationDatasets(SNPhood.o))
datasets = which(annotationDatasets(SNPhood.o) %in% datasets)
}
if (returnNames) {
datasets = annotationDatasets(SNPhood.o)[datasets]
}
datasets
}
.getErrorForOnlyPrepareSamplesCorrelation <- function() {
paste0('The parameter "onlyPrepareForDatasetCorrelation" in the main function "analyzeSNPhood" has been set to TRUE.',
' Cannot execute the function because of this. Run the function "analyzeSNPhood" again and set "onlyPrepareForDatasetCorrelation" to FALSE (the default).')
}
.getListOfSupportedParameters <- function() {
list(
readFlag_isPaired = "logical",
readFlag_isProperPair = "logical",
readFlag_isUnmappedQuery = "logical",
readFlag_hasUnmappedMate = "logical",
readFlag_isMinusStrand = "logical",
readFlag_isMateMinusStrand = "logical",
readFlag_isFirstMateRead = "logical",
readFlag_isSecondMateRead = "logical",
readFlag_isNotPrimaryRead = "logical",
readFlag_isNotPassingQualityControls = "logical",
readFlag_isDuplicate = "logical",
readFlag_reverseComplement = "logical",
readFlag_simpleCigar = "logical",
path_userRegions = "character",
zeroBasedCoordinates = "logical",
regionSize = "integer",
binSize = "integer",
readGroupSpecific = "logical",
strand = "character",
startOpen = "logical",
endOpen = "logical",
headerLine = "logical",
linesToParse = "integer",
lastBinTreatment = "character",
assemblyVersion = "character",
nCores = "integer",
keepAllReadCounts = "logical",
normByInput = "logical",
normAmongEachOther = "logical",
poolDatasets = "logical"
)
}
.initSNPhoodObject <- function(SNPhood.o,
files.df,
readGroups,
onlyPrepareForDatasetCorrelation) {
par.l = parameters(SNPhood.o)
nRegionsCur = nRegions(SNPhood.o)
stopifnot(!SNPhood.o@internal$isInitialized)
assertCharacter(readGroups, min.len = 1, min.chars = 1)
SNPhood.o@annotation$readGroups = readGroups
for (alleleCur in readGroups) {
SNPhood.o@readCountsUnbinned[[alleleCur]] = list()
SNPhood.o@readCountsBinned[[alleleCur]] = list()
SNPhood.o@internal$sizeFactors[[alleleCur]] = list()
SNPhood.o@enrichmentBinned[[alleleCur]] = list()
SNPhood.o@annotation$genotype$readsDerived [[alleleCur]] = list()
SNPhood.o@internal$readStartPos[[alleleCur]] = list()
SNPhood.o@internal$readWidth[[alleleCur]] = list()
# Iterate over all unique individual input files and signal files
if (!all(is.na(files.df$input))) {
inputFiles.vec = c(unique(files.df$input), unique(unlist(strsplit(files.df$input, ",", fixed = TRUE))))
} else {
inputFiles.vec = c()
}
signalFiles.vec = unique(c(files.df$signal, files.df$individual))
# Init the matrices
for (filesCur in c(inputFiles.vec, signalFiles.vec)) {
if (is.na(filesCur)) {
next
}
SNPhood.o@readCountsUnbinned[[alleleCur]] [[filesCur]] = rep(0, nRegionsCur)
# Some slots are only needed for signal files and not input files
if (filesCur %in% signalFiles.vec) {
# Moved to .extractAndNormalize
#SNPhood.o@annotation$genotype$readsDerived[[alleleCur]] [[filesCur]] = array(data = rep(0,4), dim = c(4, nRegionsCur), dimnames = list(c("A", "C", "G", "T")))
if (nReadGroups(SNPhood.o) > 1) {
SNPhood.o@internal$readStartPos[[alleleCur]][[filesCur]] = vector("list", nRegionsCur)
SNPhood.o@internal$readWidth [[alleleCur]][[filesCur]] = vector("list", nRegionsCur)
} else {
SNPhood.o@internal$readStartPos[[alleleCur]][[filesCur]] = NULL
SNPhood.o@internal$readWidth [[alleleCur]][[filesCur]] = NULL
}
}
if (!onlyPrepareForDatasetCorrelation) {
SNPhood.o@readCountsBinned[[alleleCur]] [[filesCur]] = matrix(0, nrow = nRegionsCur, ncol = nBins(SNPhood.o))
}
}
# Enrichment matrices are only stored for the final individuals, not the individual files
if (!onlyPrepareForDatasetCorrelation) {
if (par.l$normByInput) {
for (filesCur in unique(files.df$individual)) {
SNPhood.o@enrichmentBinned[[alleleCur]] [[filesCur]] = matrix(0, nrow = nRegionsCur, ncol = nBins(SNPhood.o))
}
}
}
} # end for (alleleCur in readGroups)
SNPhood.o@internal$isInitialized = TRUE
SNPhood.o
}
#' @importFrom methods new
.createSNPhoodObject <- function(par.l,
onlyPrepareForDatasetCorrelation) {
SNPhood.o <- new("SNPhood",
annotation = list(regions = NULL,
genotype = list(readsDerived = list()),
files = list(),
readGroups = NA
),
config = par.l,
enrichmentBinned = list(),
readCountsUnbinned = list(),
readCountsBinned = list(),
internal = list(disableObjectIntegrityChecking = TRUE,
isAllelicRatio = FALSE ,
mergedReadGroups = FALSE,
addResultsElementsAdded = character(),
readStartPos = list(),
readWidth = list(),
sizeFactors = list(),
globalBackground = list(),
countType = NA,
plot_origBinSNPPosition = NA,
plot_labelBins = NA,
isInitialized = FALSE
),
additionalResults = list()
)
SNPhood.o
}
.BAMHeaderConsistencyChecks <- function(par.l,
annotationRegions,
signalFileCur,
expectedReadGroups = NULL,
verbose = TRUE) {
assertCharacter(signalFileCur, any.missing = FALSE)
assertFileExists(signalFileCur, access = "r")
assertFlag(verbose)
chrSizes.df = .getGenomeData(par.l$assemblyVersion, includeChrM = TRUE)
# Check if the index file is present for all BAM files. Use of which in the ScanBamParam function requires that a BAM index file (<filename>.bai) exists.
# Indexing provides two significant benefits. First, an index allows a BAM file to be efficiently accessed by range. A corollary is that providing a which argument to ScanBamParam requires an index. Second, coordinates for extracting information from a BAM file can be derived from the index, so a portion of a remote BAM file can be retrieved with local access only to the index.
.checkAndCreateIndexFile(signalFileCur)
if (verbose) message(" Parse BAM header...")
# Examine the BAM file header to extract the sequence names and read groups
res = scanBamHeader(signalFileCur)
chrSizes = res[[signalFileCur]]$targets
if (length(chrSizes) == 0) {
stop("The chromosome sizes are not specified in the BAM file ", signalFileCur,". No automated verification of the correct genome assembly version can be done. Carefully check if the genome asselbly version has been specified correctly.", sep = "")
}
definedChr = unique(as.character(seqnames(annotationRegions)))
unknownChr = which(!names(chrSizes) %in% definedChr)
if (length(unknownChr) > 0) {
chrSizes = chrSizes[-unknownChr]
}
# Consistency checks for all chromosomes from the user regions
for (chrCur in definedChr) {
# Check if defined in BAM header
if (!chrCur %in% names(chrSizes)) {
stop("Chromosome ", chrCur, "(from the user-defined regions) not defined in BAM file ", signalFileCur)
}
# Check if we have the chromosome length available in the chrSizes.df object
if (!chrCur %in% rownames(chrSizes.df)) {
stop("Could not find ", chrCur," in automatically retrieved list of chromosome lengths. Only chromosomes ", paste0(rownames(chrSizes.df), collapse = ","), " are available. Please remove all references to chromosome ", chrCur, " in the BAM file.", sep = "")
}
# Check if size is identical to what has been automatically retrieved
if (chrSizes[which(names(chrSizes) == chrCur)] != chrSizes.df[chrCur,]$size) {
stop("A mismatch was detected for the chromosome size of ", chrCur," (BAM file: ", chrSizes[which(names(chrSizes) == chrCur)] , ", specified genome assembly: ", chrSizes.df[chrCur,]$size,"). ", sep = "")
}
}
# Read groups
readGroupSpecificityTmp = FALSE
readGroups = "allReadGroups"
if (par.l$readGroupSpecific) readGroupSpecificityTmp = TRUE
if (readGroupSpecificityTmp) {
indexes = which(grepl("@RG", names(res[[signalFileCur]]$text)))
readGroups = as.character(sapply(res[[signalFileCur]]$text[indexes], function(x) {gsub("ID:","",x[[1]])}))
if (verbose & testNull(expectedReadGroups)) message(" Read group specific reporting has been requested. The following read groups have been identified in the BAM header: ", paste(readGroups, collapse = ", " ))
# If we don't find at least two read groups there, set readGroupSpecificity to FALSE
# Check if the same set of read groups is defined in the files
if (!testNull(expectedReadGroups)) {
if (!identical(sort(readGroups), sort(expectedReadGroups))) {
stop("The read groups as defined in the file ", signalFileCur, " are different from the ones in previous files. Expected ", paste0(expectedReadGroups, collapse = ","),", found: ", paste0(readGroups, collapse = ","))
}
}
if (length(unique(readGroups)) < 2) {
readGroupSpecificityTmp = FALSE
readGroups = "allReadGroups"
warning("Although it was requested to derive read count statistics read group-specifically, the BAM header does not contain at least two different read group definitions. Consequently, read group-specific reporting has been disabled. ")
}
}
list("readGroupSpecific" = readGroupSpecificityTmp, "readGroups" = readGroups)
}
#' @import checkmate
#' @importFrom Biostrings nucleotideFrequencyAt subseq
#' @import GenomicRanges
# @importFrom GenomicRanges strand start
.extractAndNormalize <- function(par.l,
annotationRegions,
readGroups,
SNPPos.gr,
files.vec,
individualName,
bins.l,
type,
verbose = TRUE) {
# TODO: Potential optimization: Save results of individual files so they don't have to be parsed multiple times
assertVector(files.vec, min.len = 1)
assertChoice(type, c("signal", "input"))
assertFlag(verbose)
if (type == "input") {
avgBackground.l = list()
avgBackgroundOld.l = list()
}
nRegionsCur = length(annotationRegions)
readCountsRegions.l = list()
readCountsBins.l = list()
# Init final result list
results.l = list(globalBackground = NULL,
genotypes = NULL,
readStartPos = NULL,
readWidth = NULL,
readCountsBinned = NULL,
readCountsUnbinned = NULL
)
# Can be ignored for input
if (type == "signal") {
for (alleleCur in readGroups) {
results.l$genotypes[[alleleCur]] = array(data = rep(0,4), dim = c(4, nRegionsCur), dimnames = list(c("A", "C", "G", "T")))
if (length(readGroups) > 1) {
results.l$readStartPos[[alleleCur]] = vector("list", nRegionsCur)
results.l$readWidth [[alleleCur]] = vector("list", nRegionsCur)
} else {
results.l$readStartPos[[alleleCur]] = NULL
results.l$readWidth [[alleleCur]] = NULL
}
}
}
nFilesProcessed = 0
for (fileCur in files.vec) {
assertCharacter(fileCur, any.missing = FALSE)
assertFileExists(fileCur, access = "r")
nFilesProcessed = nFilesProcessed + 1
if (verbose) message("\nPROCESS FILE ", nFilesProcessed, " of ", length(files.vec), ":", fileCur)
.checkAndCreateIndexFile(fileCur)
##################
# Various checks #
##################
expectedReadGroups = readGroups
# Only check read group consistency among signal files
if (type == "input") expectedReadGroups = NULL
res.l = .BAMHeaderConsistencyChecks(par.l, annotationRegions, fileCur, expectedReadGroups, verbose = TRUE)
################
# Get genotype #
################
if (type == "signal") {
start.timeTemp <- Sys.time()
if (!par.l$onlyPrepareForDatasetCorrelation) {
if (verbose) message(" Extract data from BAM file ", fileCur," for the SNPs only (this may take a while)...")
bamSignalSNP = .extractFromBAM(SNPPos.gr, fileCur, .getFieldsForBAMParsing("SNPs"), par.l, verbose)
.callCalculateOverlapsReads <- function(x, bamSignal_filteredCurSNP, SNPPos.gr) {
suppressWarnings(
nucleotideFrequencyAt(
subseq(bamSignal_filteredCurSNP[[x]]$seq, start(SNPPos.gr)[x] - bamSignal_filteredCurSNP[[x]]$pos + 1, width = 1), 1)
)
}
if (verbose) message(" Determine genotype distribution at original user positions for A, C, G, and T...")
for (alleleCur in readGroups) {
if (verbose) message(" Read group ",alleleCur)
res.l = .filterReads(bamSignalSNP, as.character(strand(SNPPos.gr)), alleleCur, par.l$strand)
bamSignal_filteredCurSNP = res.l$bamObj
# Suppress warnings so that N bases are simply not counted but don't confuse the user
readDist.l = .execInParallelGen(nCores = par.l$nCores,
returnAsList = TRUE,
listNames = NULL,
iteration = seq_len(length(bamSignal_filteredCurSNP)),
abortIfErrorParallel = TRUE,
verbose = FALSE,
functionName = .callCalculateOverlapsReads, bamSignal_filteredCurSNP, SNPPos.gr)
results.l$genotypes[[alleleCur]] = results.l$genotypes[[alleleCur]] + unlist(readDist.l, use.names = FALSE)
}
}
.printExecutionTime(start.timeTemp, verbose)
}
#################
# Extract reads #
#################
if (verbose) message(" Extract data from BAM file ", fileCur," for the full SNP regions (this may take a while)...")
bamFile = .extractFromBAM(annotationRegions, fileCur, .getFieldsForBAMParsing("regions"), par.l, verbose)
# Determine the read numbers per region before filtering by strand
nReads.vec = unlist(lapply(bamFile, function(x) length((x$pos))), use.names = FALSE)
nReadsSumAll = sum(nReads.vec)
if (nReadsSumAll == 0) warning(" No reads were found in file ", fileCur," that overlap the user regions. This may also happen if the values of the various readFlag_* parameters have been specified too strictly or incorrectly. For example, if you have single-end data, make sure you specified this when generating the default parameters list.")
#############################################################################
# Filter reads and evaluate read counts per bin specifically per read group #
#############################################################################
if (verbose & par.l$readGroupSpecific & type == "signal") message(" Analyze read counts specifically for each read group (this may take a while)...")
if (verbose & !par.l$readGroupSpecific & type == "signal") message(" Analyze read counts (this may take a while)...")
for (alleleCur in readGroups) {
if (testNull(readCountsRegions.l[[alleleCur]])) {
readCountsRegions.l[[alleleCur]] = list()
readCountsBins.l [[alleleCur]] = list()
}
if (verbose & par.l$readGroupSpecific) message(" Read group ", alleleCur, "...")
res.l = .filterReads(bamFile, as.character(strand(annotationRegions)), alleleCur, par.l$strand)
nFilteredReadsSignal = res.l$filteredReads
bamFile_filteredCur = res.l$bamObj
if (verbose & nFilteredReadsSignal > 0) message(" Filtered ", nFilteredReadsSignal," reads out of ", nReadsSumAll," in file ", fileCur," because of strand and read group")
nReads.vec = unlist(lapply(bamFile_filteredCur, function(x) length((x$pos))), use.names = FALSE)
nReadsSum = sum(nReads.vec)
if (nReadsSum == 0) warning(" No reads were found in file ", fileCur," that overlap the user regions for the read group ", alleleCur,".")
# Determine the read numbers per region
readCountsRegions.l[[alleleCur]] [[fileCur]] = nReads.vec
if (par.l$onlyPrepareForDatasetCorrelation) next
if (type == "signal") {
# Save the start positions and pool them if necessary
# Only necessary if the number of read groups is at least 2
if (length(readGroups) > 1) {
iterationBam = seq_len(length(bamFile_filteredCur))
results.l$readStartPos[[alleleCur]] =
lapply(iterationBam, function(x) {c(results.l$readStartPos[[alleleCur]][[x]], bamFile_filteredCur[[x]]$pos)})
results.l$readWidth[[alleleCur]] =
lapply(iterationBam, function(x) {c(results.l$readWidth[[alleleCur]][[x]], bamFile_filteredCur[[x]]$qwidth)})
}
}
# Calculate read coverage per bin for the input and the current read group
if (verbose) message(" Analyze read counts per bin (this may take a while)...")
readCountsBins.l[[alleleCur]] [[fileCur]] = .calculateOverlapsBin(bamFile_filteredCur, par.l, bins.l, readCountsRegions.l[[alleleCur]] [[fileCur]], verbose)
####################################
# Calculate genome-wide background #
####################################
if (type == "input") {
avgBackground.l[[fileCur]] = .calculateGenomeWideBackground(fileCur, par.l, nReadsToCheck = 1000, verbose = TRUE)
# Compute the average number of counts per bin across all input files that belong to the particular individual
avgBackgroundOld.l[[fileCur]] = mean(readCountsBins.l[[alleleCur]] [[fileCur]])
if (verbose) message("Global average background across bins (old, just for comparison): ", avgBackgroundOld.l[[fileCur]])
}
rm(bamFile_filteredCur)
}
# Delete large object and explicitly call the garbage collector because bamSignal may be a large object
rm(bamFile)
} #end for all files
#########################################
# Normalize libraries and add up counts #
#########################################
if (nFilesProcessed > 1) {
if (verbose) message(" Normalize library sizes...")
# Determine the normalization factors independent of the read groups. For this, sum up the counts for all read groups
counts.m = matrix(0, ncol = nFilesProcessed, nrow = nRegionsCur)
for (alleleCur in readGroups) {
for (i in seq_len(length(files.vec))) {
counts.m[, i] = counts.m[, i] + readCountsRegions.l[[alleleCur]] [[files.vec[i]]]
}
}
stopifnot(all(counts.m >= 0))
colnames(counts.m) = files.vec
res.l = .scaleLibraries(counts.m)
sizeFactors.vec = res.l$sizeFactors
for (fileCur in files.vec) {
stopifnot(!testNull(sizeFactors.vec[fileCur]))
for (alleleCur in readGroups) {
readCountsRegions.l[[alleleCur]] [[fileCur]] = readCountsRegions.l[[alleleCur]] [[fileCur]] / sizeFactors.vec[fileCur]
if (par.l$onlyPrepareForDatasetCorrelation) next
readCountsBins.l [[alleleCur]] [[fileCur]] = readCountsBins.l [[alleleCur]] [[fileCur]] / sizeFactors.vec[fileCur]
if (type == "input") {
avgBackground.l[[fileCur]] = avgBackground.l[[fileCur]] / sizeFactors.vec[fileCur]
avgBackgroundOld.l[[fileCur]] = avgBackgroundOld.l[[fileCur]] / sizeFactors.vec[fileCur]
}
}
}
} # end if (nFilesProcessed > 1)
#####################
# Adjust background #
#####################
if (type == "input") {
# Compute the mean average background.
avgBackgroundAll = mean(unlist(avgBackground.l, use.names = FALSE))
#avgBackgroundOldAll = mean(unlist(avgBackgroundOld.l))
results.l$globalBackground = avgBackgroundAll
}
################################
# Sum up the normalized counts #
################################
for (alleleCur in readGroups) {
# Round to 0 places after the comma to allow potential subsequent normalization
results.l$readCountsUnbinned[[alleleCur]] = round(Reduce("+", readCountsRegions.l[[alleleCur]]), 0)
if (par.l$onlyPrepareForDatasetCorrelation) next
results.l$readCountsBinned[[alleleCur]] = round(Reduce("+", readCountsBins.l [[alleleCur]]), 0)
}
results.l
}
# Important: Append the size factors because both signal and input may be used multiple times with other signals and input files, respectively, each of which translates to a different size factor
# Change the name so that it can be traced back to the corresponding signal data file
# Switch so that the mapping is composed of a signal and the corresponding input datafile always
# Save the size factor. Reverse the names here to make the mapping properly. This is correct and verified!
#names(sizeFactors.vec) = rev(names(sizeFactors.vec))
#SNPhood.o@internal$sizeFactors[[alleleCur]] [[individualCur]] = c(SNPhood.o@internal$sizeFactors[[alleleCur]] [[individualCur]] , sizeFactors.vec[inputFileSetCur])
#SNPhood.o@internal$sizeFactors[[alleleCur]] [[inputFileSetCur]] = c(SNPhood.o@internal$sizeFactors[[alleleCur]] [[inputFileSetCur]], sizeFactors.vec[individualCur])
.determineHeterozygosity <- function (SNPhood.o,
readGroupsToTest = annotationReadGroups(SNPhood.o),
takeOnlyMostAbundantOnePerReadGroup = TRUE,
verbose = TRUE) {
assertFlag(verbose)
assertSubset(readGroupsToTest, annotationReadGroups(SNPhood.o))
assertFlag(takeOnlyMostAbundantOnePerReadGroup)
heterozygous.m = matrix(NA, ncol = nDatasets(SNPhood.o) + 1, nrow = nRegions(SNPhood.o))
dimnames(heterozygous.m) = list(annotationRegions(SNPhood.o), c(annotationDatasets(SNPhood.o),"shared"))
for (i in 1:nDatasets(SNPhood.o)) {
datasetCur = annotationDatasets(SNPhood.o)[i]
for (j in seq_len(nRegions(SNPhood.o))) {
genotypeDist.vec = c()
isNA = FALSE
for (readGroupCur in readGroupsToTest) {
distCur = SNPhood.o@annotation$genotype$readsDerived[[readGroupCur]][[datasetCur]][,j]
indexGenotypesPresent = which(distCur > 0)
if (length(indexGenotypesPresent) > 0) {
genotypesPresent = sort(indexGenotypesPresent, decreasing = TRUE)
if (takeOnlyMostAbundantOnePerReadGroup) {
genotypeDist.vec = c(genotypeDist.vec, names(genotypesPresent)[1])
} else {
genotypeDist.vec = c(genotypeDist.vec, names(genotypesPresent))
}
} else {
heterozygous.m[j, i] = NA
isNA = TRUE
break
}
}
# Take most abundant genotype in each read group and check their identity
indexBasePairs = which(genotypeDist.vec %in% c("A", "C", "G", "T"))
if (!isNA) {
if (length(indexBasePairs) > 1) {
heterozygous.m[j, i] = TRUE
} else {
heterozygous.m[j, i] = FALSE
}
}
}
}
heterozygous.m[,ncol(heterozygous.m)] = rowSums(heterozygous.m, na.rm = TRUE) == (ncol(heterozygous.m) - 1)
SNPhood.o@annotation$genotype$heterozygosity = heterozygous.m
SNPhood.o
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.