#' @title Statistical Evaluation of a given Pairwise Alignment
#' @description This function allows to quantify the statistical significance of a given pairwise alignment
#' between a query and subject sequence based on a sampled score distribution returned by \code{\link{randSeqDistr}}.
#' @param seq a character vector storing a sequence as string for which random sequences shall be computed.
#' @param subject a character vector storing a subject sequence as string to which \code{seq} shall be pairwise aligned.
#' @param sample_size a numeric value specifying the number of random sequences that shall be returned.
#' @param FUN a pairwise alignment function such as \code{\link[Biostrings]{pairwiseAlignment}} or any other function
#' that takes sequence arguments as first and second input.
#' @param ... additional arguments that shall be passed to \code{FUN}.
#' @param fit_distr a character string specifying the probability distribution that shall be fitted to the histogram
#' of scores returned by \code{\link{randSeqDistr}}. See \code{\link[fitdistrplus]{fitdist}} when method = "mme" for details.
#' A special case is \code{fit_distr} = \code{"simple"}. This way simply the relative frequency of random scores that are greater than
#' the real alignment score is returned as p-value.
#' @param gof a logical value specifying whether or not godness of fit measures shall be printed to the console.
#' @param comp_cores a numeric value specifying the number of cores you want to use for multicore processing.
#' @details
#' The test statistic is developed using moment matching estimation of a given probability distribution that
#' is fitted to the alignment score vector returned by \code{\link{randSeqDistr}}. The corresponding distribution
#' parameters are estimated by the \code{\link[fitdistrplus]{fitdist}} and the p-value quantifying the statistical
#' significance of the pairwise alignment of the input sequences is returned.
#'
#' The following distributions can be fitted to the alignment score distribution:
#'
#' \itemize{
#' \item \code{\link{pnorm}}
#' \item \code{\link{plnorm}}
#' \item \code{\link{ppois}}
#' \item \code{\link{pexp}}
#' \item \code{\link{pgamma}}
#' \item \code{\link{pnbinom}}
#' \item \code{\link{pgeom}}
#' \item \code{\link{pbeta}}
#' }
#'
#' A special case is \code{fit_distr} = \code{"simple"}. This way simply the relative frequency of random scores that are greater than
#' the real alignment score is returned as p-value.
#'
#' @author Hajk-Georg Drost
#' @return a p-value quantifying the statistical significance of the pairwise alignment of the input sequences.
#' @examples
#' seq_example <- "MEDQVGFGF"
#' subject_example <- "AYAIDPTPAF"
#' # evaluate alignment
#' p_val_align <- evalAlignment(seq_example, subject_example, 10,
#' Biostrings::pairwiseAlignment,
#' scoreOnly=TRUE, fit_distr = "norm",
#' comp_cores = 1)
#' @seealso \code{\link{randSeqDistr}}, \code{\link{randomSeqs}}, \code{\link[fitdistrplus]{fitdist}}
#' @export
evalAlignment <-
function(seq,
subject,
sample_size,
FUN,
... ,
fit_distr = "norm",
gof = FALSE,
comp_cores = 1) {
if (!is.element(
fit_distr,
c(
"norm",
"lnorm",
"pois",
"exp",
"gamma",
"nbinom",
"geom",
"beta",
"simple"
)
))
stop(
"Please choose a distribution to fit your alignment scores that is supported by this function.", call. =
)
align.fun <- match.fun(FUN)
# random sequence vector
r_seqs <-
vector(mode = "character", length = sample_size)
# alignment score vector of random seqs
scores <-
vector(mode = "numeric", length = sample_size)
p_val <- vector(mode = "numeric", length = 1)
real_score <- vector(mode = "numeric", length = 1)
# determine the score of the real pairwise alignment
real_score <- align.fun(seq, subject, ...)
# determine random scores
r_seqs <-
randomSeqs(seq = seq, sample_size = sample_size)
scores <- randSeqDistr(
seq = seq,
subject = subject,
sampled_strings = r_seqs,
FUN = align.fun,
...,
comp_cores = comp_cores
)
if (fit_distr == "norm") {
# paramter estimates of the normal distribution
# estimated by moment matching
est <-
fitdistrplus::fitdist(scores, distr = fit_distr, method = "mme")
if (gof) {
message(fitdistrplus::gofstat(est))
cat("\n")
}
p_val <-
stats::pnorm(
real_score,
mean = est$estimate[1],
sd = est$estimate[2],
lower.tail = FALSE
)
}
if (fit_distr == "gamma") {
if (range(scores)[1] < 0)
stop(
"Please choose a distribution to fit your alignment scores that supports negative values.", call. = FALSE
)
# paramter estimates of the gamma distribution
# estimated by moment matching
est <-
fitdistrplus::fitdist(scores, distr = fit_distr, method = "mme")
if (gof) {
message(fitdistrplus::gofstat(est))
cat("\n")
}
p_val <-
stats::pgamma(
real_score,
shape = est$estimate[1],
rate = est$estimate[2],
lower.tail = FALSE
)
}
if (fit_distr == "lnorm") {
# paramter estimates of the log-normal distribution
# estimated by moment matching
est <-
fitdistrplus::fitdist(scores, distr = fit_distr, method = "mme")
if (gof) {
message(fitdistrplus::gofstat(est))
cat("\n")
}
p_val <-
stats::plnorm(
real_score,
meanlog = est$estimate[1],
sdlog = est$estimate[2],
lower.tail = FALSE
)
}
if (fit_distr == "pois") {
if (range(scores)[1] < 0)
stop(
"Please choose a distribution to fit your alignment scores that supports negative values.", call. = FALSE
)
# paramter estimates of the poisson distribution
# estimated by moment matching
est <-
fitdistrplus::fitdist(scores, distr = fit_distr, method = "mme")
if (gof) {
message(fitdistrplus::gofstat(est))
cat("\n")
}
p_val <-
stats::ppois(real_score,
lambda = est$estimate[1],
lower.tail = FALSE)
}
if (fit_distr == "exp") {
if (range(scores)[1] < 0)
stop(
"Please choose a distribution to fit your alignment scores that supports negative values.", call. = FALSE
)
# paramter estimates of the exponential distribution
# estimated by moment matching
est <-
fitdistrplus::fitdist(scores, distr = fit_distr, method = "mme")
if (gof) {
message(fitdistrplus::gofstat(est))
cat("\n")
}
p_val <-
stats::pexp(real_score,
rate = est$estimate[1],
lower.tail = FALSE)
}
if (fit_distr == "nbinom") {
if (range(scores)[1] < 0)
stop(
"Please choose a distribution to fit your alignment scores that supports negative values.", call. = FALSE
)
# paramter estimates of the negative binomial distribution
# estimated by moment matching
est <-
fitdistrplus::fitdist(scores, distr = fit_distr, method = "mme")
if (gof) {
message(fitdistrplus::gofstat(est))
cat("\n")
}
p_val <-
stats::pnbinom(
real_score,
size = est$estimate[1],
mu = est$estimate[2],
lower.tail = FALSE
)
}
if (fit_distr == "geom") {
if (range(scores)[1] < 0)
stop(
"Please choose a distribution to fit your alignment scores that supports negative values.", call. = FALSE
)
if (!is.integer(real_score))
stop("Please specify a discrete (integer) input vector.", call. = FALSE)
# paramter estimates of the geometric distribution
# estimated by moment matching
est <-
fitdistrplus::fitdist(scores, distr = fit_distr, method = "mme")
if (gof) {
message(fitdistrplus::gofstat(est))
cat("\n")
}
p_val <-
stats::pgeom(real_score,
prob = est$estimate[1],
lower.tail = FALSE)
}
if (fit_distr == "beta") {
if (range(scores)[1] < 0)
stop(
"Please choose a distribution to fit your alignment scores that supports negative values.", call. = FALSE
)
# paramter estimates of the beta distribution
# estimated by moment matching
est <-
fitdistrplus::fitdist(scores, distr = fit_distr, method = "mme")
if (gof) {
message(fitdistrplus::gofstat(est))
cat("\n")
}
p_val <-
stats::pbeta(
real_score,
shape1 = est$estimate[1],
shape2 = est$estimate[2],
lower.tail = FALSE
)
}
if (fit_distr == "simple") {
p_val <- sum(scores > real_score) / length(scores)
}
names(p_val) <- "p.value"
return(p_val)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.