Nothing
#' Parasite prevalence estimation and confidence intervals
#'
#' Estimates parasite prevalence and corresponding confidence intervals from parasite abundance data, optionally stratified by grouping variables. Two types of confidence intervals are provided: exact binomial intervals and Blaker intervals, allowing robust inference across a wide range of sample sizes and prevalence values.
#'
#' Prevalence is defined as the proportion of hosts infected with a given parasite taxon:
#' \deqn{P = \frac{nH_{inf}}{nH}}
#' where:
#'
#' \itemize{
#' \item \eqn{nH} is the number of hosts analyzed (non-missing observations)
#' \item \eqn{nH_{inf}} nHinf is the number of infected hosts (abundance > 0)
#' }
#' The function reshapes the dataset into long format and computes prevalence for each parasite taxon and grouping combination (if specified).
#' Two types of confidence intervals are calculated:
#'
#' \itemize{
#' \item \strong{Exact (Clopper–Pearson) interval}: This is an exact binomial confidence interval, conservative but valid for all sample sizes, especially small samples or extreme prevalence values.
#' \item \strong{Blaker interval}: This interval is also exact but less conservative than Clopper–Pearson, providing shorter intervals while maintaining correct coverage.
#' }
#' Statistical considerations:
#'
#' \itemize{
#' \item Prevalence is a binomial proportion and can be estimated even for small sample sizes.
#' \item However, when sample size is very small (e.g., \eqn{nH=1}), the estimate lacks precision and confidence intervals become uninformative.
#' \item When no infected hosts are observed (\eqn{nH_{inf}=0}), prevalence is 0, and confidence intervals reflect uncertainty around zero.
#' }
#' The interpretation of results, particularly under small sample sizes, remains the responsibility of the user.
#' @usage
#' para_prevalence_CI(dataset, sp_cols, group_vars = NULL, decimal_places = 2,
#' conf_level = 0.95, output_type = "proportion", combine_ci = FALSE, verbose = FALSE)
#'
#' @param dataset Data frame with parasitological data.
#' @param sp_cols Vector with the names of the columns containing abundance of parasites (taxa) to calculate the parasitological descriptors.
#' @param group_vars Vector with the names of categorical variables used to define groups (e.g., "Sex", "Site"). Default is \code{NULL}.
#' @param decimal_places Number of decimal places to round the values. Default is \code{2}.
#' @param conf_level Confidence level for interval estimation (e.g., \code{0.95} for 95\% confidence intervals).
#' @param output_type Format of the result: either \code{"proportion"} or \code{"percentage"}. Default is \code{"proportion"}.
#' @param combine_ci Logical. If \code{TRUE}, the interval is expressed as a single column (min - max). If \code{FALSE}, the interval is split into separate lower and upper limit columns.
#' @param verbose A logical value indicating if progress messages should be given. Default = \code{FALSE}
#'
#' @return A data frame containing prevalence estimates and confidence intervals for each parasite taxon, either globally or by group. The following variables are returned:
#' \itemize{
#' \item \code{nH}: Number of hosts analyzed
#' \item \code{nH_inf}: Number of infested hosts
#' \item \code{prevalence}: Estimated prevalence
#' \item \code{Lower_exact}: Lower bound of the exact (Clopper--Pearson) interval
#' \item \code{Upper_exact}: Upper bound of the exact (Clopper--Pearson) interval
#' \item \code{Lower_blaker}: Lower bound of the Blaker interval
#' \item \code{Upper_blaker}: Upper bound of the Blaker interval
#' \item \code{Observation}: Categorical description of the data context:
#' \itemize{
#' \item \code{"Not analyzed"}: No valid observations are available for the given combination (all values are missing or the combination is absent in the dataset); therefore, no estimates can be computed.
#' \item \code{"One host analyzed"}: Only a single host analyzed is available for the given combination; thus, no population-level inference is possible and statistical summary measures are not estimated.
#' \item \code{"No hosts infested"}: Hosts are present for the given combination, but none are infested (abundance = 0 for all observations); consequently, no statistical summary measures of abundance or intensity can be estimated.
#' \item \code{"One host infested"}: Only a single infested host is recorded for the given combination; therefore, no sample-based estimation of intensity or related summary measures is possible.
#' \item \code{"Multiple hosts infested"}: More than one infested host is recorded for the given combination, allowing the estimation of summary measures.
#' }
#' }
#'
#' @examples
#'prevalence_CI <- para_prevalence_CI(para_data$dataset,
#' sp_cols = c("Sp1"),
#' group_vars = c("Site"),
#' decimal_places = 2,
#' conf_level = 0.95,
#' output_type = "proportion",
#' combine_ci = TRUE,
#' verbose = TRUE)
#'
#'prevalence_CI
#'
#'
#' @references
#' Bush, A.O., Lafferty, K.D., Lotz, J.M., Shostak, A.W. (1997). Parasitology meets ecology on its own terms:
#' Margolis revisited. \emph{Journal of Parasitology}, 83(4), 575–583.
#'
#' Reiczigel, J., Marozzi, M., Fabian, I., Rózsa, L. (2019). Biostatistics for parasitologists – a primer to
#' quantitative parasitology. \emph{Trends in Parasitology}, 35(4), 277–281.
#'
#' @author Juan Manuel Cabrera, Exequiel Furlan and Elisa Helman
#'
#' @export
para_prevalence_CI <- function(dataset, sp_cols, group_vars = NULL, decimal_places = 2,
conf_level = 0.95, output_type = "proportion", combine_ci = FALSE, verbose = FALSE) {
Abund<-NA
data_long<-NA
Sp<-NA
nH<-NA
prevalence<-NA
nH_inf<-NA
Lower_blaker<-NA
Upper_blaker<-NA
Observation<-NA
Lower_exact<-NA
Upper_exact<-NA
CI_exact<-NA
CI_blaker<-NA
if (verbose) message("Checking arguments...")
# Validaciones
if (is.null(sp_cols) || length(sp_cols) == 0) {
stop("The species columns must be specified (sp_cols).")
}
if (!all(sp_cols %in% colnames(dataset))) {
stop("Some species columns do not exist in the dataset.")
}
if (!is.null(group_vars) && !all(group_vars %in% colnames(dataset))) {
stop("Some grouping variables do not exist in the dataset.")
}
# Formato largo
data_long <- dataset %>%
tidyr::pivot_longer(
cols = dplyr::all_of(sp_cols),
names_to = "Sp",
values_to = "Abund"
) %>%
dplyr::mutate(Abund = as.numeric(as.character(Abund)))
if (verbose) message("Calculating prevalence...")
group_by_vars <- c(group_vars, "Sp")
# Cálculo base
prev_results <- data_long %>%
dplyr::group_by(dplyr::across(dplyr::all_of(group_by_vars))) %>%
dplyr::summarise(
nH = sum(!is.na(Abund)),
nH_inf = sum(Abund > 0, na.rm = TRUE),
.groups = "drop"
) %>%
dplyr::mutate(
prevalence = ifelse(nH > 1, nH_inf / nH, NA_real_)
)
# Intervalos de confianza
prev_results <- prev_results %>%
dplyr::rowwise() %>%
dplyr::mutate(
Lower_exact = ifelse(nH > 1,
stats::binom.test(nH_inf, nH, conf.level = conf_level)$conf.int[1],
NA_real_),
Upper_exact = ifelse(nH > 1,
stats::binom.test(nH_inf, nH, conf.level = conf_level)$conf.int[2],
NA_real_),
Lower_blaker = ifelse(nH > 1,
BlakerCI::binom.blaker.limits(nH_inf, nH, level = conf_level)[1],
NA_real_),
Upper_blaker = ifelse(nH > 1,
BlakerCI::binom.blaker.limits(nH_inf, nH, level = conf_level)[2],
NA_real_)
) %>%
dplyr::ungroup()
# Redondeo
prev_results <- prev_results %>%
dplyr::mutate(
prevalence = round(prevalence, decimal_places),
Lower_exact = round(Lower_exact, decimal_places),
Upper_exact = round(Upper_exact, decimal_places),
Lower_blaker = round(Lower_blaker, decimal_places),
Upper_blaker = round(Upper_blaker, decimal_places)
)
# Observaciones
prev_results <- prev_results %>%
dplyr::mutate(
Observation = dplyr::case_when(
nH == 0 ~ "Not analyzed",
nH == 1 ~ "One host analyzed",
nH_inf == 0 ~ "No hosts infested",
nH_inf == 1 ~ "One host infested",
TRUE ~ "Multiple hosts infested"
)
)
# Convertir a porcentaje
if (output_type == "percentage") {
prev_results <- prev_results %>%
dplyr::mutate(
prevalence = prevalence * 100,
Lower_exact = Lower_exact * 100,
Upper_exact = Upper_exact * 100,
Lower_blaker = Lower_blaker * 100,
Upper_blaker = Upper_blaker * 100
)
}
# WARNING: IC no estimables
failed_ci <- any(is.na(prev_results$Lower_exact) | is.na(prev_results$Upper_exact))
if (failed_ci) {
warning("Confidence intervals could not be estimated for some combinations due to insufficient sample size.")
}
# WARNING: IC poco confiables
unreliable_ci <- any(prev_results$nH <= 5 & prev_results$nH > 1)
if (unreliable_ci) {
warning("Some confidence intervals may be unreliable due to small sample size.")
}
# Combinar CI
if (combine_ci) {
prev_results <- prev_results %>%
dplyr::mutate(
CI_exact = ifelse(nH > 1,
paste(Lower_exact, Upper_exact, sep = " - "),
NA_character_),
CI_blaker = ifelse(nH > 1,
paste(Lower_blaker, Upper_blaker, sep = " - "),
NA_character_)
) %>%
dplyr::select(
dplyr::all_of(group_vars),
Sp,
nH,
nH_inf,
prevalence,
CI_exact,
CI_blaker,
Observation
)
} else {
prev_results <- prev_results %>%
dplyr::select(
dplyr::all_of(group_vars),
Sp,
nH,
nH_inf,
prevalence,
Lower_exact,
Upper_exact,
Lower_blaker,
Upper_blaker,
Observation
)
}
if (verbose) message("Calculation completed")
return(prev_results)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.