#' Auto detection of a fitted `pcKeepComp` param for filterFFT function
#'
#' This function tries to obtain the minimum number of components needed in a
#' FFT filter to achieve or get as close as possible to a given correlation
#' value. Usually you don't need to call this function directly; it is used
#' by `filterFFT` by default.
#'
#' This function predicts a suitable `pcKeepComp` value for `filterFFT`
#' function. This is the recommended amount of components (in percentage) to
#' keep in the `filterFFT` function to obtain a correlation of (or near of)
#' `cor.target`.
#'
#' The search starts from two given values `pc.min`, `pc.max` and uses linear
#' interpolation to quickly reach a value that gives a correlation between the
#' filtered and the original near `cor.target` within the specified tolerance
#' `cor.tol`.
#'
#' To allow a quick detection without an exhaustive search, this function uses
#' a subset of the data by randomly sampling those regions with meaningful
#' coverage values (i.e., different from 0 or NA) larger than `smpl.min.size`.
#' If it's not possible to obtain `smpl.max.size` from this region (this could
#' be due to flanking 0's, for example) at least `smpl.min.size` will be used
#' to check correlation. Mean correlation between all sampled regions is used
#' to test the performance of the `pcKeepComp` parameter.
#'
#' If the number of meaningful bases in `data` is less than `smpl.min.size *
#' (smpl.num/2)` all the `data` vector will be used instead of using sampling.
#'
#' @param data Numeric vector to be filtered
#' @param pc.min,pc.max Range of allowed values for `pcKeepComp` (minimum and
#' maximum), in the range 0:1.
#' @param max.iter Maximum number of iterations
#' @param verbose Extra information (debug)
#' @param cor.target Target correlation between the filtered and the original
#' profiles. A value around 0.99 is recommended for Next Generation Sequencing
#' data and around 0.7 for Tiling Arrays.
#' @param cor.tol Tolerance allowed between the obtained correlation and the
#' target one.
#' @param smpl.num If `data` is a large vector, some samples from the vector
#' will be used instead of the whole dataset. This parameter tells the number
#' of samples to pick.
#' @param smpl.min.size,smpl.max.size Minimum and maximum size of the samples.
#' This is used for selection and sub-selection of ranges with meaningful
#' values (i.e., different from 0 and NA). Power of 2 values are recommended,
#' although not mandatory.
#' @param \dots Parameters to be passed to `autoPcKeepComp`
#'
#' @return Fitted `pcKeepComp` value
#'
#' @author Oscar Flores \email{oflores@@mmb.pcb.ub.es}, David Rosell
#' \email{david.rosell@@irbbarcelona.org}
#' @keywords attribute
#'
#' @examples
#' # Load dataset
#' data(nucleosome_htseq)
#' data <- as.vector(coverage.rpm(nucleosome_htseq)[[1]])
#'
#' # Get recommended pcKeepComp value
#' pckeepcomp <- pcKeepCompDetect(data, cor.target=0.99)
#' print(pckeepcomp)
#'
#' # Call filterFFT
#' f1 <- filterFFT(data, pcKeepComp=pckeepcomp)
#'
#' # Also this can be called directly
#' f2 <- filterFFT(data, pcKeepComp="auto", cor.target=0.99)
#'
#' # Plot
#' library(ggplot2)
#' i <- 1:2000
#' plot_data <- rbind(
#' data.frame(x=i, y=data[i], coverage="original"),
#' data.frame(x=i, y=f1[i], coverage="two calls"),
#' data.frame(x=i, y=f2[i], coverage="one call")
#' )
#' qplot(x=x, y=y, color=coverage, data=plot_data, geom="line",
#' xlab="position", ylab="coverage")
#'
#' @export pcKeepCompDetect
#'
#' @importFrom stats runif
#' @importFrom IRanges IRanges
#' @importFrom stats cor
#' @importMethodsFrom BiocGenerics width start
pcKeepCompDetect <- function (data, pc.min = 0.01, pc.max = 0.1, max.iter = 20,
                              verbose = FALSE, cor.target = 0.98,
                              cor.tol = 1e-3, smpl.num = 25,
                              smpl.min.size = 2 ^ 10, smpl.max.size = 2 ^ 14)
{
    # Subsample regions with meaningful coverage so each correlation check is
    # cheap (see .sampleData); for short inputs this returns the whole vector
    samp <- .sampleData(
        data,
        smpl.num,
        smpl.min.size,
        smpl.max.size,
        verbose=verbose
    )
    # Remember the user-supplied bounds so the final result can be clamped
    pc.min.input <- pc.min
    pc.max.input <- pc.max
    # Mean correlation between the FFT-filtered and the original profile for a
    # given pcKeepComp value, averaged over all sampled regions
    .meancor <- function(pc) {
        if (pc == 1) {
            # Keeping all components reproduces the input exactly
            return(1)
        }
        if (pc == 0) {
            # Keeping no components retains no signal
            return(0)
        }
        cors <- vapply(
            samp,
            function(x) cor(filterFFT(x, pcKeepComp=pc), x),
            numeric(1)
        )
        mean(cors, na.rm=TRUE)
    }
    # Deviation from the target correlation at both initial bounds
    eval.min <- .meancor(pc.min)
    eval.max <- .meancor(pc.max)
    desv.min <- eval.min - cor.target
    desv.max <- eval.max - cor.target
    # Start from whichever bound is already closer to the target
    if (abs(desv.min) < abs(desv.max)) {
        pc.best <- pc.min
        dv.best <- desv.min
    } else {
        pc.best <- pc.max
        dv.best <- desv.max
    }
    # Regula-falsi style search: iterate until the deviation is within
    # tolerance, the iteration budget is spent, or the best estimate leaves
    # the user-supplied range
    i <- 0
    while (
        abs(dv.best) > cor.tol &&
        i < max.iter &&
        pc.best > pc.min.input &&
        pc.best < pc.max.input
    ) {
        i <- i + 1
        if (verbose) {
            cat(
                "Iteration",
                i,
                ". pcKeepComp=",
                pc.best,
                ". Correlation=",
                dv.best + cor.target,
                "\n"
            )
        }
        # Linear interpolation of the optimal value between current bounds
        pc.new <- pc.min -
            (desv.min * ((pc.max - pc.min) / (desv.max - desv.min)))
        # Clamp the candidate to the valid 0:1 range for pcKeepComp
        if (pc.new > 1) {
            pc.new <- 1
        } else if (pc.new < 0) {
            pc.new <- 0
        }
        # Evaluate the new candidate
        eval.new <- .meancor(pc.new)
        desv.new <- eval.new - cor.target
        # Update the search interval
        if (desv.min < 0 && desv.max > 0) {
            # Target is bracketed: replace the bound on pc.new's side
            if (desv.new > 0) {
                pc.max <- pc.new
                desv.max <- desv.new
            } else {
                pc.min <- pc.new
                desv.min <- desv.new
            }
        } else if (desv.min > 0 && desv.max > 0) {
            # Both bounds above target: shift the interval downwards
            pc.max <- pc.min
            desv.max <- desv.min
            pc.min <- pc.new
            desv.min <- desv.new
        } else if (desv.min < 0 && desv.max < 0) {
            # Both bounds below target: shift the interval upwards.
            # Bug fix: desv.min must inherit the old desv.max (mirroring the
            # branch above); assigning desv.new to both endpoints collapsed
            # the interval and zeroed the interpolation denominator.
            pc.min <- pc.max
            desv.min <- desv.max
            pc.max <- pc.new
            desv.max <- desv.new
        }
        # Track the best value seen so far
        if (abs(desv.new) < abs(dv.best)) {
            pc.best <- pc.new
            dv.best <- desv.new
        }
    }
    # Clamp the result to the user-supplied range
    if (pc.best > pc.max.input) {
        pc.best <- pc.max.input
    }
    if (pc.best < pc.min.input) {
        pc.best <- pc.min.input
    }
    return(pc.best)
}
# Sample up to smpl.num windows of meaningful coverage from `data`.
#
# Returns a list of numeric vectors: either the whole of `data` (for short
# inputs or inputs with too few covered bases) or `smpl.num` windows taken
# from runs of non-zero, non-NA values, each of length at most smpl.max.size.
.sampleData <- function (data, smpl.num, smpl.min.size, smpl.max.size,
                         verbose = FALSE)
{
    # Sanity check: minimum sample size cannot exceed the maximum one
    if (smpl.min.size > smpl.max.size) {
        warning("smpl.min.size > smpl.max.size, using only smpl.min.size")
        smpl.max.size <- smpl.min.size
    }
    res <- list()
    # For short sequences, use all the data
    if (length(data) < smpl.min.size * (smpl.num / 2)) {
        if (verbose) {
            message("Short fragment. Using all data")
        }
        res[[1]] <- data
    # For long sequences, use sampling
    } else {
        if (verbose) {
            message("Long sequence. Trying sampling...")
        }
        # Runs of meaningful values (non-zero and non-NA) longer than the
        # minimum sample size
        rang <- IRanges(data != 0 & !is.na(data))
        rang <- rang[width(rang) > smpl.min.size]
        tota <- sum(width(rang))
        # If the overall useful bases don't satisfy the criteria, return all
        if (tota < smpl.min.size * (smpl.num / 2)) {
            if (verbose) {
                message(
                    "No enough covered bases to apply sampling. ",
                    "Using all data"
                )
            }
            res[[1]] <- data
        } else {
            if (verbose) {
                message(" Selecting regions for sampling")
            }
            # Sampling with probabilities proportional to each range's width
            reps <- sample(
                seq_along(rang),
                size=smpl.num,
                replace=TRUE,
                prob=width(rang) / tota
            )
            # For ranges wider than smpl.max.size, pick a random start offset
            # so the extracted window still fits inside the range; the final
            # window width is min(range width, smpl.max.size)
            wids <- width(rang[reps])
            # Available slack per selected range (0 when the range fits)
            marg <- pmax(wids - smpl.max.size, 0)
            over <- marg > 0
            # One offset in [1, slack) per oversized range; runif vectorizes
            # over `max`, replacing the original per-element sapply loop
            marg[over] <- floor(runif(n=sum(over), min=1, max=marg[over]))
            fina <- IRanges(
                start=start(rang[reps]) + marg,
                width=pmin(wids, smpl.max.size)
            )
            res <- lapply(fina, function(x) data[x])
        }
    }
    if (verbose) {
        message("Returning ", length(res), " regions")
    }
    return(res)
}