#' Select the observations based on the average of a factor
#'
#' The negative binomial average of the \code{Count} variable is calculated for
#' each level of \code{variable}.
#' Only the levels which are equal or larger than \code{threshold} times the
#' maximal average (in the original scale) are retained.
#' @param observation the \code{data.frame} with observations
#' @param variable the name of the \code{factor}
#' @param threshold the minimal threshold
#' @export
#' @importFrom MASS glm.nb
#' @importFrom n2khelper check_single_probability check_dataframe_variable
#' @importFrom assertthat assert_that is.string has_name
#' @importFrom stats coef
#' @examples
#' observation <- data.frame(
#' Count = c(100, 101, 50, 51, 1, 0, 0, 0),
#' LocationID = factor(rep(1:4, each = 2))
#' )
#' select_factor_threshold(observation, "LocationID", threshold = 0.05)
select_factor_threshold <- function(observation, variable, threshold) {
assert_that(is.string(variable))
threshold <- check_single_probability(x = threshold, name = "threshold")
assert_that(inherits(observation, "data.frame"))
assert_that(has_name(observation, "Count"))
assert_that(has_name(observation, variable))
variable_factor <- observation[[variable]]
if (!is.factor(variable_factor)) {
variable_factor <- factor(variable_factor)
}
if (nrow(observation) < 2 * length(levels(variable_factor))) {
stop(
"The number of observations much be at least twice the number of levels in ",
variable
)
}
model <- glm.nb(observation$Count ~ 0 + variable_factor)
log_threshold <- max(coef(model)) + log(threshold)
selection <- levels(variable_factor)[coef(model) >= log_threshold]
observation <- observation[variable_factor %in% selection, ]
return(observation)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.