# -------------------------------------------------------------------------------------------------
#' dsr_inner
#'
#' Generates dsrs. All arguments are passed from the calculate_dsr wrapper
#' function with the exception of those defined below
#'
#' @param rtn_nonindependent_vardsr bool, keep default (FALSE) for independent
#' events. For non-independent events, set to TRUE on the first iteration
#' through the function to output the variance to pass in to the second
#' iteration.
#' @param use_nonindependent_vardsr bool, keep default (FALSE) for independent
#' events. For non-independent events, pass TRUE to use the custom_vardsr
#' field for CI calculation in second run of function.
#'
#' @inheritParams calculate_dsr
#'
#' @return When rtn_nonindependent_vardsr = FALSE, returns a data frame of ID
#' variables plus the dsr value, confidence limit & metadata fields as
#' specified by the type argument. For non-independent events first iteration
#' with rtn_nonindependent_vardsr = TRUE, returns just the ID variables and
#' the dsr variance to be passed into the second iteration.
#'
#' @section Notes: This is an internal package function that is called by
#' calculate_dsr. It will run through once when events are independent. When
#' events are non-independent, it will run through twice, calculating the
#' variance for event frequencies on the first iteration and passing this
#' value in to the function on the second iteration.
#'
#' @noRd
#'
# -------------------------------------------------------------------------------------------------
dsr_inner <- function(data,
x,
n,
stdpop,
type,
confidence,
multiplier,
rtn_nonindependent_vardsr = FALSE,
use_nonindependent_vardsr = FALSE) {
# validate arguments specific to dsr_inner function
if (isTRUE(rtn_nonindependent_vardsr) &&
("custom_vardsr" %in% names(data) || isTRUE(use_nonindependent_vardsr))) {
stop("cannot get nonindependent vardsr and use nonindependent vardsr in the same execution")
}
# scale and extract confidence values
confidence[confidence >= 90] <- confidence[confidence >= 90] / 100
conf1 <- confidence[1]
conf2 <- confidence[2]
# create dummy custom_vardsr column when not in use to prevent errors evaluating vardsr code
if (!use_nonindependent_vardsr) {
method = "Dobson"
data <- data %>%
mutate(custom_vardsr = NA_real_)
} else {
method = "Dobson, with confidence adjusted for non-independent events"
}
# calculate DSR and vardsr
dsrs <- data %>%
mutate(
wt_rate = na.zero(.data$x) * .data$stdpop / .data$n,
sq_rate = na.zero(.data$x) * (.data$stdpop / (.data$n))^2,
) %>%
summarise(
total_count = sum(.data$x, na.rm = TRUE),
total_pop = sum(.data$n),
value = sum(.data$wt_rate) / sum(.data$stdpop) * multiplier,
vardsr = case_when(
isTRUE(use_nonindependent_vardsr) ~ unique(.data$custom_vardsr),
.default = 1 / sum(.data$stdpop)^2 * sum(.data$sq_rate)
),
.groups = "keep"
)
if (!rtn_nonindependent_vardsr) {
# Calculate CIs
dsrs <- dsrs |>
ungroup() |>
mutate(
lowercl = .data$value + sqrt(.data$vardsr / .data$total_count) *
(byars_lower(.data$total_count, conf1) - .data$total_count) *
multiplier,
uppercl = .data$value + sqrt(.data$vardsr / .data$total_count) *
(byars_upper(.data$total_count, conf1) - .data$total_count) *
multiplier,
lower99_8cl = .data$value + sqrt(.data$vardsr / .data$total_count) *
(byars_lower(.data$total_count, 0.998) - .data$total_count) *
multiplier,
upper99_8cl = .data$value + sqrt(.data$vardsr / .data$total_count) *
(byars_upper(.data$total_count, 0.998) - .data$total_count) *
multiplier
) %>%
mutate(
confidence = paste0(confidence * 100, "%", collapse = ", "),
statistic = paste("dsr per",format(multiplier, scientific = FALSE)),
method = method
)
# rename or drop confidence limits depending whether 1 or 2 CIs requested
if (!is.na(conf2)) {
names(dsrs)[names(dsrs) == "lowercl"] <- "lower95_0cl"
names(dsrs)[names(dsrs) == "uppercl"] <- "upper95_0cl"
} else {
dsrs <- dsrs %>%
select(!c("lower99_8cl", "upper99_8cl"))
}
# remove DSR calculation for total counts < 10
dsrs <- dsrs %>%
mutate(
across(c("value", starts_with("upper"), starts_with("lower")),
function(x) if_else(.data$total_count < 10, NA_real_, x)),
statistic = if_else(
.data$total_count < 10,
"dsr NA for total count < 10",
.data$statistic
)
)
}
# drop fields not required based on values of nonindependent_breakdowns and type arguments
if (rtn_nonindependent_vardsr) {
dsrs <- dsrs %>%
select(group_cols(), "vardsr")
} else if (type == "lower") {
dsrs <- dsrs %>%
select(!c("total_count", "total_pop", "value", starts_with("upper"),
"vardsr", "confidence", "statistic", "method"))
} else if (type == "upper") {
dsrs <- dsrs %>%
select(!c("total_count", "total_pop", "value", starts_with("lower"),
"vardsr", "confidence", "statistic", "method"))
} else if (type == "value") {
dsrs <- dsrs %>%
select(!c("total_count", "total_pop", starts_with("lower"), starts_with("upper"),
"vardsr", "confidence", "statistic", "method"))
} else if (type == "standard") {
dsrs <- dsrs %>%
select(!c("vardsr", "confidence", "statistic", "method"))
} else if (type == "full") {
dsrs <- dsrs %>%
select(!c("vardsr"))
}
return(dsrs)
}
# -------------------------------------------------------------------------------------------------
#' Calculate Directly Standardised Rates using calculate_dsr
#'
#' Calculates directly standardised rates with confidence limits using Byar's
#' method (1) with Dobson method adjustment (2) including option to further
#' adjust confidence limits for non-independent events (3).
#'
#' @param data data frame containing the data to be standardised, pre-grouped if
#' multiple DSRs required; unquoted string; no default
#' @param x field name from data containing the observed number of events for
#' each standardisation category (eg ageband) within each grouping set (eg
#' area); unquoted string; no default
#' @param n field name from data containing the populations for each
#' standardisation category (eg ageband) within each grouping set (eg area);
#' unquoted string; no default
#' @param stdpop field name from data containing the standard populations for
#' each age band; unquoted string; no default
#' @param type defines the data and metadata columns to be included in output;
#' can be "value", "lower", "upper", "standard" (for all data) or "full" (for
#' all data and metadata); quoted string; default = "full"
#' @param confidence the required level of confidence expressed as a number
#' between 0.9 and 1 or a number between 90 and 100 or can be a vector of 0.95
#' and 0.998, for example, to output both 95 percent and 99.8 percent percent
#' CIs; numeric; default 0.95
#' @param multiplier the multiplier used to express the final values (eg 100,000
#' = rate per 100,000); numeric; default 100,000
#' @param independent_events whether events are independent. Set to TRUE for
#' independent events. When set to FALSE an adjustment is made to the
#' confidence intervals - to do this, the dataset provided must include event
#' frequency breakdowns and column x is redefined as the number of unique
#' individuals who experienced each frequency of event, rather than the total
#' number of events.
#' @param eventfreq field name from data containing the event frequencies. Only
#' required when independent_events = FALSE; unquoted string; default NULL
#' @param ageband field name from data containing the age bands for
#' standardisation. Only required when independent_events = FALSE; unquoted
#' string; default NULL
#'
#' @return When type = "full", returns a tibble of total counts, total
#' populations, directly standardised rates, lower confidence limits, upper
#' confidence limits, confidence level, statistic and method for each grouping
#' set. Use the type argument to limit the columns output.
#'
#' @importFrom rlang sym quo_name
#' @import dplyr
#' @export
#'
#' @examples
#' library(dplyr)
#' df <- data.frame(
#' indicatorid = rep(c(1234, 5678, 91011, 121314), each = 19 * 2 * 5),
#' year = rep(2006:2010, each = 19 * 2),
#' sex = rep(rep(c("Male", "Female"), each = 19), 5),
#' ageband = rep(c(0,5,10,15,20,25,30,35,40,45,
#' 50,55,60,65,70,75,80,85,90), times = 10),
#' obs = sample(200, 19 * 2 * 5 * 4, replace = TRUE),
#' pop = sample(10000:20000, 19 * 2 * 5 * 4, replace = TRUE),
#' esp2013 = rep(esp2013, 40)
#' )
#'
#' ## Example 1 - Default execution
#' df %>%
#' group_by(indicatorid, year, sex) %>%
#' calculate_dsr(obs, pop, stdpop = esp2013)
#'
#' ## Example 2 - Calculate both 95% and 99.8% CIs in single execution
#' df %>%
#' group_by(indicatorid, year, sex) %>%
#' calculate_dsr(obs, pop, stdpop = esp2013, confidence = c(0.95, 0.998))
#'
#' ## Example 3 - Drop metadata columns from the output
#' df %>%
#' group_by(indicatorid, year, sex) %>%
#' calculate_dsr(obs, pop, stdpop = esp2013, type = "standard")
#'
#' ## Example 4 - Calculate DSRs for non-independent events
#'
#' library(tidyr)
#'
#' # For non-independent events the input data frame must breakdown events into
#' # counts of unique individuals by event frequency. The code chunk below
#' # creates a dummy data frame in this required format. Note that assignment of
#' # 10%, 20% and 70% of events to each event frequency is purely to create a
#' # data frame in the required format whilst retaining the same total event and
#' # population distributions by group and age band as example 1 to allow
#' # comparison of the outputs.
#'
#' df_freq <- df %>%
#' mutate(
#' f3 = floor((obs * 0.1)/3), # 10 % of events in individuals with 3 events
#' f2 = floor((obs * 0.2)/2), # 20 % of events in individuals with 2 events
#' f1 = (obs - (3 * f3) - (2 * f2)) # 70% of events in individuals with 1 event
#' ) %>%
#' select(!"obs") %>%
#' pivot_longer(
#' cols = c("f1", "f2", "f3"),
#' names_to = "eventfrequency",
#' values_to = "uniqueindividuals",
#' names_prefix = "f"
#' ) %>%
#' mutate(eventfrequency = as.integer(eventfrequency))
#'
#' # Calculate the dsrs - notice that output DSR values match those in
#' # example 1 but the confidence intervals are wider
#'
#' df_freq %>%
#' group_by(indicatorid, year, sex) %>%
#' calculate_dsr(
#' x = uniqueindividuals,
#' n = pop,
#' stdpop = esp2013,
#' independent_events = FALSE,
#' eventfreq = eventfrequency,
#' ageband = ageband
#' )
#'
#'
#' @section Notes: For total counts >= 10 Byar's method (1) is applied using
#' the internal byars_lower and byars_upper functions. When the total count is
#' < 10 DSRs are not reliable and will therefore be suppressed in the output.
#'
#' @references (1) Breslow NE, Day NE. Statistical methods in cancer research,
#' volume II: The design and analysis of cohort studies. Lyon: International
#' Agency for Research on Cancer, World Health Organisation; 1987. \cr \cr (2)
#' Dobson A et al. Confidence intervals for weighted sums of Poisson
#' parameters. Stat Med 1991;10:457-62. \cr \cr (3) See the DSR chapter of the
#' [Fingertips Public Health Technical Guidance](https://fingertips.phe.org.uk/static-reports/public-health-technical-guidance/)
#'
#'
#' @family PHEindicatormethods package functions
# -------------------------------------------------------------------------------------------------
calculate_dsr <- function(data,
x,
n,
stdpop = NULL,
type = "full",
confidence = 0.95,
multiplier = 100000,
independent_events = TRUE,
eventfreq = NULL,
ageband = NULL) {
# Validate arguments ---------------------------------------------------------
# check required arguments present
if (missing(data) | missing(x) | missing(n) | missing(stdpop)) {
stop("function calculate_dsr requires at least 4 arguments: data, x, n, stdpop")
}
# check columns exist in data frame
if (!is.data.frame(data)) {
stop("data must be a data frame object")
}
if (!deparse(substitute(x)) %in% colnames(data)) {
stop("x is not a field name from data")
}
if (!deparse(substitute(n)) %in% colnames(data)) {
stop("n is not a field name from data")
}
if (!deparse(substitute(stdpop)) %in% colnames(data)) {
stop("stdpop is not a field name from data")
}
# hard-code field names
data <- data %>%
rename(
x = {{ x }},
n = {{ n }},
stdpop = {{ stdpop }}
)
if (!is.numeric(data$x)) {
stop("field x must be numeric")
} else if (!is.numeric(data$n)) {
stop("field n must be numeric")
} else if (!is.numeric(data$stdpop)) {
stop("field stdpop must be numeric")
} else if (anyNA(data$n)) {
stop("field n cannot have missing values")
} else if (anyNA(data$stdpop)) {
stop("field stdpop cannot have missing values")
} else if (any(pull(data, x) < 0, na.rm = TRUE)) {
stop("numerators must all be greater than or equal to zero")
} else if (any(pull(data, n) <= 0)) {
stop("denominators must all be greater than zero")
} else if (any(pull(data, stdpop) < 0)) {
stop("stdpop must all be greater than or equal to zero")
} else if (!(type %in% c("value", "lower", "upper", "standard", "full"))) {
stop("type must be one of value, lower, upper, standard or full")
} else if (!is.numeric(confidence)) {
stop("confidence must be numeric")
} else if (length(confidence) > 2) {
stop("a maximum of two confidence levels can be provided")
} else if (length(confidence) == 2) {
if (!(confidence[1] == 0.95 & confidence[2] == 0.998)) {
stop("two confidence levels can only be produced if they are specified as 0.95 and 0.998")
}
} else if ((confidence < 0.9) | (confidence > 1 & confidence < 90) |
(confidence > 100)) {
stop("confidence level must be between 90 and 100 or between 0.9 and 1")
} else if (!is.numeric(multiplier)) {
stop("multiplier must be numeric")
} else if (multiplier <= 0) {
stop("multiplier must be greater than 0")
} else if (!rlang::is_bool(independent_events)) {
stop("independent_events must be TRUE or FALSE")
}
if (!independent_events) {
# check that eventfrequency & ageband columns are specified & exist
if (missing(eventfreq)) {
stop(paste0("function calculate_dsr requires an eventfreq column ",
"to be specified when independent_events is FALSE"))
} else if (!deparse(substitute(eventfreq)) %in% colnames(data)) {
stop("eventfreq is not a field name from data")
} else if (!is.numeric(data[[deparse(substitute(eventfreq))]])) {
stop("eventfreq field must be numeric")
} else if (anyNA(data[[deparse(substitute(eventfreq))]])) {
stop("eventfreq field must not have any missing values")
}
if (missing(ageband)) {
stop(paste0("function calculate_dsr requires an ageband column ",
"to be specified when independent_events is FALSE"))
} else if (!deparse(substitute(ageband)) %in% colnames(data)) {
stop("ageband is not a field name from data")
} else if (anyNA(data[[deparse(substitute(ageband))]])) {
stop("ageband field must not have any missing values")
}
}
# Calculate DSR and CIs ------------------------------------------------------
if (independent_events) {
## Perform dsr using CI calculation for independent events ----
dsrs <- dsr_inner(
data = data,
x = x,
n = n,
stdpop = stdpop,
type = type,
confidence = confidence,
multiplier = multiplier
)
} else {
## Perform dsr using CI calculation for non independent events ----
# hard code eventfreq and ageband column names,
# and make sure data grouped by eventfreq
data <- data %>%
rename(
eventfreq = {{ eventfreq }},
ageband = {{ ageband }}
) %>%
group_by(eventfreq, .add = TRUE)
# check grouping variables and remove eventfrequency for use later
grps <- group_vars(data)[!group_vars(data) %in% "eventfreq"]
check_groups <- data |>
group_by(pick(all_of(c(grps, "ageband")))) |>
summarise(
num_n = n_distinct(.data$n),
num_stdpop = n_distinct(.data$stdpop),
.groups = "drop"
) |>
filter(.data$num_n > 1 | .data$num_stdpop > 1)
if (nrow(check_groups) > 0) {
stop(
paste0("There are rows with the same grouping variables and ageband",
" but with different populations (n) or standard populations",
"(stdpop)")
)
}
# get vardsrs for each event frequency and sum up
freq_var <- data %>%
dsr_inner(
x = x,
n = n,
stdpop = stdpop,
type = type,
confidence = confidence,
multiplier = multiplier,
rtn_nonindependent_vardsr = TRUE
) %>%
mutate(freqvars = .data$vardsr * .data$eventfreq^2) %>%
group_by(pick(all_of(grps))) %>%
summarise(
custom_vardsr = sum(.data$freqvars),
.groups = "drop"
)
# summarise total events
event_data <- data %>%
mutate(events = .data$eventfreq * .data$x) %>%
group_by(pick(all_of(c(grps, "ageband", "n", "stdpop")))) %>%
summarise(
x = sum(.data$events, na.rm = TRUE),
.groups = "drop"
)
# calculate overall DSR passing in nonindependent variance
dsrs <- event_data %>%
left_join(freq_var, by = grps) %>%
group_by(pick(all_of(grps))) %>%
dsr_inner(
x = x,
n = n,
stdpop = stdpop,
type = type,
confidence = confidence,
multiplier = multiplier,
use_nonindependent_vardsr = TRUE
)
}
return(dsrs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.