# Script started 3/25/2020 by martin holdrege
# functions to support TURN calculation (mostly used in the 02_TUR...script)
# other functions are found in the "functions.R" script
# namespace imports for use by roxygen2 -----------------------------------
# functions from these packages freely available for functions in turnr
# to use
#' @import dplyr
#' @importFrom stats weighted.mean
#' @importFrom utils data
create_epidates <- function(date_vec) {
# args:
# date_vec--date vector the covers whole length of time of interest
# returns:
# dataframe with epiweek, epiyear, and epidate (date of middle of epiweek)
# covers date range of date_vec (plus a week on either side)
stopifnot(
class(date_vec) == "Date"
)
# slightly expanded range of dates
# using longer range of dates so have epidate be middle date of week for even
# partial epiweeks at beginning and end
dates_longer <- tibble(date = seq(from = min(date_vec)-14,
to = max(date_vec) +14, by = 1))
epi_dates <- dates_longer %>%
mutate(epiweek = lubridate::epiweek(date),
epiyear = lubridate::epiyear(date)) %>%
group_by(.data$epiyear, .data$epiweek) %>%
summarize(epidate = mean(.data$date)) %>%
ungroup()
# get rid of first/last week b/ those may be partial (ie epidate not
# actually middle of the epiweek)
out <- epi_dates %>%
filter(epidate != min(epidate), epidate != max(epidate))
out
}
# mean weighted by TUR -------------------------------------------------------
mean_weighted_TUR <- function(x, wt, na.rm = TRUE) {
# args:
# x- numeric vector to take weighted mean of
# wt--numeric vector (to be used as weight)--usually TUR or similar
# returns:
# weighted mean
wt <- wt/sum(wt, na.rm = TRUE)
out <- weighted.mean(x = x, w = wt, na.rm = na.rm)
out
}
# testing
if (FALSE) {
x <- 1:10
wt1 <- rep(2, 10)
wt2 <- c(rep(0, 9), 1)
# should be true
mean_weighted_TUR(x, wt1) == 5.5
mean_weighted_TUR(x, wt2) == 10
}
# site to daily TUR ---------------------------------------------------------
site_2_dly <- function(df, group_vars) {
# args:
# df--site level daily data, with columns including "date",
# "InstrumentVersion", "daily_TUR_rp", "daily_TUR_all", "n_active"
# group_vars--un-quoted names of columns to group by e.g
# group_vars = vars(date, InstrumentVersion)
# returns:
# data frame with daily_TUR_rp, daily_TUR_all and n_seerial_3mo summed
# to the level of the grouping variable
required_cols <- c("date", "InstrumentVersion", "daily_TUR_rp",
"daily_TUR_all", "n_active")
check_cols(df, required_cols)
check_quosures(group_vars)
out <- df %>%
# this right join may not be necessary--just precaution so no missing dates
group_by(!!!group_vars) %>%
summarize(daily_TUR_rp = sum(daily_TUR_rp, na.rm = TRUE), # rp only
# rp and non:
daily_TUR_all = sum(daily_TUR_all, na.rm = TRUE),
n_active = sum(n_active, na.rm = TRUE)) %>%
ungroup()
out
}
# site to daily pathogens ----------------------------------------------------
site_2_dly_path <- function(df, group_vars) {
# args:
# df--site level daily data of pathogens, with columns including "date",
# "InstrumentVersion", "TargetName", "daily_count"
# group_vars--un-quoted names of columns to group by e.g
# group_vars = vars(date, InstrumentVersion)
# returns:
# data frame with daily_count which is the sum of daily count within
# groups
required_cols <- c("date", "InstrumentVersion", "TargetName",
"daily_count")
check_cols(df, required_cols)
check_quosures(group_vars)
out <- df %>%
# this right join may not be necessary--just precaution so no missing dates
group_by(!!!group_vars) %>%
summarize(daily_count = sum(daily_count, na.rm = TRUE)) %>%
ungroup()
out
}
# -------------------------------------------------------------------------
dly_2_epi <- function(df, group_vars) {
# args:
# df--dataframe of daily TUR data that
# group_vars--unquoted names of columns to group by wrapped in vars() function
# returns:
# df with "daily_TUR_rp", "daily_TUR_all", "n_active" cols aggregated to epiweek
required_cols <- c("daily_TUR_rp", "daily_TUR_all", "n_active", "date")
check_cols(df, required_cols)
check_quosures(group_vars)
epi_dates <- create_epidates(df$date) # df of epidates/weeks
out <- df %>%
mutate(epiweek = lubridate::epiweek(date),
epiyear = lubridate::epiyear(date)) %>%
group_by(!!!group_vars) %>%
summarize(epi_TUR_rp = sum(daily_TUR_rp, na.rm = TRUE),
epi_TUR_all = sum(daily_TUR_all, na.rm = TRUE),
n_active = mean(n_active, na.rm = TRUE),
epi_n_days = lu(date)) %>%
ungroup() %>%
left_join(epi_dates, by = c("epiweek", "epiyear"))
out
}
# daily to epi week pathogen data---------------------------------------------
dly_2_epi_path <- function(df, group_vars) {
# args:
# df--dataframe of daily counts by pathogens (daily_count)
# group_vars--unquoted names of columns to group by wrapped in vars() function
# returns:
# df with epicount (replacing input of dail_count). which is aggregated to epiweek
required_cols <- c("date", "InstrumentVersion", "TargetName",
"daily_count")
check_cols(df, required_cols)
check_quosures(group_vars)
epi_dates <- create_epidates(df$date) # df of epidates/weeks
out <- df %>%
mutate(epiweek = lubridate::epiweek(date),
epiyear = lubridate::epiyear(date)) %>%
group_by(!!!group_vars) %>%
summarize(epi_count = sum(daily_count, na.rm = TRUE)) %>%
ungroup()%>%
left_join(epi_dates, by = c("epiweek", "epiyear"))
out
}
# arrange by epidate ------------------------------------------------------
arrange_epidate <- function(df) {
# args:
# df--dataframe (often grouped), with epidate column
# returns:
# df sorted by epidate (within each group), throws an error if epidates
# are not all 7 days apart (e.g a week missing).
# for use in other functions
stopifnot(
is.data.frame(df),
"epidate" %in% names(df)
)
out <- arrange(df, epidate, .by_group = TRUE) # sorting
check <- out %>%
summarize(is_week_interval = all(diff(epidate)==7))
if(!all(check$is_week_interval)) {
stop("non-consecutive epidates present")
}
out
}
# calc_n_active_adj -------------------------------------------------------
calc_n_active_adj <- function(df, group_vars, width = 52) {
# args:
# df
#
# returns:
#
required_cols <- c("epidate", "epi_TUR_rp", "epi_TUR_all", "n_active")
check_cols(df, required_cols)
check_quosures(group_vars)
out <- df %>%
group_by(!!!group_vars) %>%
arrange_epidate() %>%
mutate(
sum_rp_long = zoo::rollapply(epi_TUR_rp, width = width, FUN = sum,
na.rm = TRUE, partial = TRUE),
sum_all_long = zoo::rollapply(epi_TUR_all, width = width, FUN = sum,
na.rm = TRUE, partial = TRUE),
prop_rp = sum_rp_long/sum_all_long,
# this next line is necessary when doing by region
# otherwise subsequent weighted averages are thrown off
prop_rp = ifelse(epi_TUR_rp == 0, NA, prop_rp),
n_active_adj = prop_rp*n_active, # Adjust number of active instruments
#Week X RPTests/(Active Instruments week x prop_rp):
Y = epi_TUR_rp/n_active_adj)
out
}
# calculate Y prime -------------------------------------------------------
calc_Y_2prime <- function(df, group_vars, means, Y, Y_prime) {
# args:
# df--dataframe with for which Y` and Y" (all_adj) needs to be calculated
# group_vars--variables to group by when taking weighted means to calculate Y"
# means--names vector of means of Y of instrument versions
# Y--string, name of column containg Y prime variable
# Y_prime--string, name to give the Y_prime variable
# returns:
# df new column with Y prime, and new instrument version (all_adj) for which
# Y_prime col is actually Y"
required_cols <- c("InstrumentVersion", "epi_n_days", "n_active")
check_cols(df, required_cols)
stopifnot(
check_quosures(group_vars),
is.numeric(means),
is.character(Y),
is.character(Y_prime)
)
# calculating Y prime
df[[Y_prime]] <- FA1.5_adjust(x = df[[Y]], InstrumentVersion = df$InstrumentVersion,
FA1.5_mean = means["FA1.5"],
FA2.0_mean = means["FA2.0"],
Torch_mean = means["Torch"])
# calculating Y double prime--still called Y_prime but under
# instrumentversion = "all_adj"
df_adj <- df %>%
filter(InstrumentVersion != "all") %>%
group_by(!!!group_vars) %>%
# epi_n_days same for all inst version--but just want to keep for late ruse
summarize(epi_n_days = mean(epi_n_days),
!!Y_prime := weighted.mean(x = .data[[Y_prime]],
wt = n_active, na.rm = TRUE),
InstrumentVersion = "all_adj") %>%
ungroup()
# adding back together
out <- bind_rows(df, df_adj)
out
}
# add_missing_dates_2_site_path -------------------------------------------
add_missing_dates_2_site_path <- function(df) {
# args:
# df--data frame of daily counts of by TargetName, instrumentversion and SiteID
# returns:
# dataframe with same columns as input but with no missing dates within the
# range of dates found for each SiteID/instrumentversion combination
# also TargeName now ordered factor
required_cols <- c("date", "SiteID", "daily_count", "TargetName",
"InstrumentVersion")
check_cols(df, required_cols)
all_pathogens <- sort(unique(df$TargetName))
split_list1 <- split(df, df$SiteID) # split by site id
# iterate over sites
df_split2 <- purrr::map(split_list1, function(df) {
site_instrument <- split(df, df$InstrumentVersion)
# "loop" of instrument versions for the site
site_instrument2 <- purrr::map2(site_instrument, names(site_instrument), function(df2, inst){
dates <- seq(from = min(df2$date), to = max(df2$date), by = "1 day")
# all combos of date/pathogen
dates_paths <- expand.grid(date = dates,
TargetName = all_pathogens,
stringsAsFactors = FALSE)
out <- left_join(dates_paths, df2, by = c("date", "TargetName"))
# left join leaves NAs--filling back in:
out$InstrumentVersion <- inst
id <- unique(out$SiteID)
out$SiteID <- id[!is.na(id)]
out
}) %>%
bind_rows()
})
out <- df_split2 %>%
bind_rows() %>%
as_tibble()
# change TargetName to a factor--want specific ordering, and
# code setup like this so runs w/ and w/o co-detection category
others <- sort(all_pathogens[all_pathogens %in% c("negative", "co-detection")],
decreasing = TRUE)
all_pathogens <- c(others, sort(all_pathogens[!all_pathogens %in% others],
decreasing = TRUE))
out$TargetName <- factor(out$TargetName,
levels = all_pathogens)
out
}
# calc_path_3wma ----------------------------------------------------------
calc_path_Y_3wma <- function(df,
means,
group_2prime,
group_3wma,
add_path_dates = FALSE
) {
# args:
# df with epi_count (count per epiweek) by TargetName.
# means--named vector of means of the 3 instrument versions
# group_2prime--names of columns wrapped in vars() to group by when calculating
# Y", passed on to calc_Y_2prime function
# group_3wma--names of columns wrapped in vars() to group by when calculating
# three week moving average
# add_path_dates--logical whether to fill in date gaps with NA when smoothing Y".
# may slow down the function. Can be needed when if get, error for non
# continuous epidates (there are some cases when data from
# some instrument versions has been removed where this is needed)
# returns:
# dataframe with path_Y, path_Y_prime and path_Y_prime_3wma columns, ie
# this is Y, Y' and Y'' for each pathogen. Y'' is "path_Y_prime" column
# when InstrumentVersion = all_adj
required_cols <- c("epidate", "epi_count", "epi_n_days", "n_active_adj")
check_cols(df, required_cols, "calc_path_Y_3wma")
stopifnot(
is.numeric(means),
check_quosures(group_3wma)
)
df2 <- df %>%
# count/instrument
mutate(path_Y = epi_count/n_active_adj) %>%
calc_Y_2prime(group_vars = group_2prime,
means = means, Y = "path_Y", Y_prime = "path_Y_prime")
# solving a problem of non-continuous epidates, occurs when
# instrument versions do not overlap (problem for ID) in dates when
# smoothing Y"--makes code substantially slower I think.
if (add_path_dates) {
stopifnot(c("epidate", "TargetName", "InstrumentVersion") %in% names(df2))
cols2expand <- unique(c("epidate", "TargetName", quo2char(group_3wma)))
combo <- expand_cols(df2, cols = cols2expand)
df2 <- right_join(df2, combo, by = cols2expand) %>%
# adding epiweek/epiyear back in
mutate(epiweek = lubridate::epiweek(.data$epidate),
epiyear = lubridate::epiyear(.data$epidate))
}
out <- df2 %>%
group_by(!!!group_3wma) %>%
arrange_epidate() %>%
# 3 week moving average
mutate(path_Y_prime_3wma = rollapply_epi(x = .data[["path_Y_prime"]],
n_days = .data[["epi_n_days"]],
na.rm = TRUE),
# b/na.rm = TRUE can calculate moving windows when that week
# is missing, want to switch those back to NA
path_Y_prime_3wma = ifelse(is.na(.data$path_Y_prime), NA,
.data$path_Y_prime_3wma)) %>%
ungroup()
out
}
# FA1.5 adjust ------------------------------------------------------------
FA1.5_adjust <- function(x, InstrumentVersion, FA1.5_mean, FA2.0_mean,
Torch_mean) {
# args:
# x--numeric vector (TUR per instrument)
# InstrumentVersion--character vector
# FA1.5_mean--mean values (TUR/instrument) of 1.5 instruments (reference)
# FA2.0_mean--mean of 2.0 instruments
# Torch_mean--mean of torch instruments
# returns:
# adjust vector so that 2.0 and Torch has same mean as the 1.5, doesn't
# adjust all or the 1.5
stopifnot(InstrumentVersion %in% c("FA1.5", "FA 1.5", "FA2.0", "FA 2.0",
"Torch", "all"),
length(InstrumentVersion) == length(x))
# switched to warning b/ in some cases may have a panel that isn't
# present in particular instrument version
if(is.na(FA1.5_mean)) {
warning("FA1.5_mean is NA, now will adjust to grand mean instead of to FA1.5")
FA1.5_mean <- mean(FA2.0_mean, Torch_mean)
}
if(is.na(FA2.0_mean)) {
warning("FA2.0_mean is NA")
}
if(is.na(Torch_mean)) {
warning("Torch_mean is NA")
}
out <- ifelse(InstrumentVersion %in% c("FA2.0", "FA 2.0") , x*FA1.5_mean/FA2.0_mean,
ifelse(InstrumentVersion == "Torch", x*FA1.5_mean/Torch_mean, x))
out
}
# number of sites in a given epiweek --------------------------------------
#' Calculate number of unique SiteIDs per epiweek
#'
#' Given daily site level data, calculate the number of unique SiteIDs
#' for each epiweek.
#'
#' @param df This is a dataframe containing daily test utilization data
#' must have columns of "daily_TUR_rp","SiteID", "date", as well as
#' columns to group by
#' @param group_vars columns to group by (e.g. region). Unquoted names of columns enclosed in vars(),
#' or NULL
#' @param window Width (in weeks) of window over which to count how many
#' sites are present.
#' @return dataframe with SiteID, date, epiweek, epiyear, epidate, n_sites
#' (number of sites that week), and other grouping variables
#' @examples
#' # see ?TUR_dat
#' calc_n_sites(TUR_dat)
#' calc_n_sites(TUR_dat, group_vars = vars(InstrumentVersion))
#' @export
calc_n_sites <- function(df,
group_vars = NULL,
window = 3) {
required_cols <- c("daily_TUR_rp","SiteID", "date")
check_cols(df, required_cols)
stopifnot(is.null(group_vars) || check_quosures(group_vars))
epidates <- create_epidates(df$date)
group_vars2 <- c(vars(epiweek, epiyear), group_vars)
df2 <- df %>%
filter(!is.na(daily_TUR_rp) & daily_TUR_rp > 0) %>%
mutate(epiweek = lubridate::epiweek(date),
epiyear = lubridate::epiyear(date)) %>%
group_by(!!!group_vars2)
df2expand <- right_join(df2, epidates, by = c("epiweek", "epiyear"))
if (is.null(group_vars)) {
cols2expand <- "epidate"
} else {
cols2expand <- quo2char(group_vars)
# epiweek/epiyear shouldn't be included in group_vars
if(any(c("epiweek", "epiyear", "epidate") %in% quo2char(group_vars))) {
stop("group_vars shouldn't included epiweek, epiyear or epidate")
}
# want to use epidate so don't all combinations of year and week
# can't have gaps in dates for rolling window
cols2expand <- c(cols2expand, "epidate")
}
# for joining so gaps don't exist
df2join <- expand_cols(df2expand, cols = cols2expand)
out <- df2 %>%
nest() %>%
mutate(SiteIDs = purrr::map(.data$data, function(x) unique(x$SiteID))) %>%
select(-.data$data) %>%
left_join(epidates, by = c("epiweek", "epiyear")) %>%
ungroup() %>%
select(-c("epiweek", "epiyear")) %>%
right_join(df2join, by = cols2expand) %>%
group_by(!!!group_vars) %>%
arrange_epidate() %>%
mutate(sites_list = roll_list(.data$SiteIDs, window = window, align = "center"),
# number of unique serial numbers within 3 months
n_sites = purrr::map_dbl(.data$sites_list, function(x) lu(unlist(x))),
epiweek = lubridate::epiweek(.data$epidate),
epiyear = lubridate::epiyear(.data$epidate)
) %>%
select(-c("sites_list", "SiteIDs")) %>%
ungroup()
out
}
# calc_TURN ---------------------------------------------------------------
#' Calculate TURN
#'
#' Calculates test utilization rate normalization (Y''), using daily site level
#' data. Y (tests/adjusted active instruments) is calculated for each
#' InstrumentVersion, then Y' prime is calculated to adjust all means to FA1.5.
#' The weighted average of these Y' is taken, and three week moving average
#' applied.
#'
#' @param df Dataframe with columns of "date", "InstrumentVersion",
#' "daily_TUR_rp" (number of RP tests that day),
#' "daily_TUR_all" (number of all tests that day),
#' "n_active" (active instruments).
#' @param group_vars columns to group by. Unquoted names of columns
#' enclosed in vars(). Must at least include InstrumentVersion (which would then
#' result in a national level TURN)
#' @param site_info NULL or dataframe (if additonal group_vars are needed).
#' must contain a SiteID column, so that it can be merged with df. Additional
#' columns (e.g. Region) need to be present in the site_info dataframe
#' to be used as group_vars.
#' @param means NULL or named vector of means. The vector needs to be length of
#' three giving the means of Y (tests/adjusted active instrument) for the
#' FA1.5, FA2.0 and Torch system (with those names provided as names of
#' elements in the vector). Leave as NULL when calculating National
#' level TURN, because then the means are calculated internally in this
#' function.
#' @param return_means logical, whether to return vector of means of Y
#' (tests/adjusted active instrument). If true, output is a list with
#' first element being the output dataframe, the second being named
#' vector of means (which can then be used as an input to this function
#' when calculating regional TURN)
#' @return Dataframe or list. The dataframe contains the following columns:
#' epiweek, epiyear,
#' InstrumentVersion (the instrument version, or 'all' for all instruments
#' combined, or all_adj for which the Y_prime column is the weighted average
#' of all the Y_prime for the other instruments or what we are calling Y''),
#' epi_TUR_rp (number of RP test that week),
#' epi_TUR_all (number of all tests that week) ,
#' n_active (number of active instruments),
#' epi_n_days (number of days that week), epidate (calander date of the middle
#' of the epidemiological week), sum_rp_long (number of rp tests within 1 year),
#' sum_all_long (number of all tests within 1 year),
#' prop_rp (proportion RP tests = sum_rp_long/sum_all_long),
#' n_active_adj (number of active instruments*prop_rp), Y
#' (rp tests/adjust active instruments), Y_prime (Y adjusted to mean of FA1.5,
#' or when InstrumentVersion == all_adj, this is the weighted average
#' of Y prime for the other instruments), Y_prime_3wma (3 week moving average
#' applied to Y_prime).
#'
#' If a list is returned (i.e. return_means = TRUE) the first element is the
#' aformentioned dataframe, the second element is the vector of mean Y for
#' each instrument version.
#' @examples
#' l <- calc_TURN(TUR_dat, return_means = TRUE)
#' # mean of Y for each instrument version
#' means <- l$means
#' # data frame containg national turn
#' national_turn <- l$df
#' # plot of final TURN curve
#' # plot(Y_prime_3wma ~ epidate,
#' # data = national_turn[national_turn$InstrumentVersion == "all_adj", ],
#' # type = "l")
#' # calculating TURN by region
#' state_turn <- calc_TURN(TUR_dat,
#' group_vars = dplyr::vars(InstrumentVersion, Region),
#' site_info = get_site_info(rp_raw),
#' means = means)
#' @export
calc_TURN <- function(df,
group_vars = vars(InstrumentVersion),
site_info = NULL,
means = NULL,
return_means = FALSE){
check_quosures(group_vars)
required_cols <- c("date", "InstrumentVersion", "daily_TUR_rp",
"daily_TUR_all", "n_active")
check_cols(df, required_cols, "calc_TURN()")
stopifnot(is.null(site_info) | is.data.frame(site_info),
is.logical(return_means))
# I'm not sure if having redundant group_vars causes problems
# so I'm not allowing
not_allowed_groups <- c("date", "epiweek", "epiyear")
group_char <- quo2char(group_vars)
stopifnot("InstrumentVersion" %in% group_char)
if(any(not_allowed_groups %in% group_char)) {
stop(paste0("group_vars should not include any of\n",
paste(not_allowed_groups, collapse = "\n")))
}
if(is.null(means) & length(group_char) != 1) {
stop("named vector of means for each instrument version must be supplied
when additional group_var variables are used")
}
# grouping by date, and other variables provided by user
dly_group_vars <- c(vars(date), group_vars)
if (!is.null(site_info)) {
if(!"SiteID" %in% names(df) | !"SiteID" %in% names(site_info)) {
stop("SiteID column must be in df and site_info dataframes")
}
df <- df %>%
left_join(site_info, by = "SiteID")
}
if(!all(group_char %in% names(df))) {
stop("all group_vars must be included in either df or site_info")
}
dly <- site_2_dly(df, group_vars = dly_group_vars)
# dataframe with every combination of date and group variables
# so no holes in the dates
df2join <- expand_cols(dly, cols = c("date", group_char))
# aggregating to epiweek
epi1 <- dly %>%
# this right join may not be necessary--just precaution so no missing dates
right_join(df2join, by = c("date", group_char)) %>%
dly_2_epi(group_vars = c(vars(epiweek, epiyear), group_vars))
# adjusted active instruments
epi2 <- calc_n_active_adj(epi1, group_vars = group_vars)
# calculate means by instrument version (when just aggregating to national
# level)
if(is.null(means)) {
# it might be possible to also calculate just calculate means
# on the fly when other grouping vars included but playing it safe
mean_epi <- epi2 %>%
filter(.data$InstrumentVersion != "all") %>%
group_by(.data$InstrumentVersion) %>%
summarize(mean = mean(.data$Y, na.rm = TRUE))
means <- mean_epi$mean
names(means) <- mean_epi$InstrumentVersion
}
# adding in grouping variables (except InstrumentVersion--don't want'
# to group by instrument version when taking weighted mean)
group_vars_2prime <- c(vars(epiyear, epiweek, epidate),
group_vars[which(group_char != "InstrumentVersion")])
epi4 <- calc_Y_2prime(epi2, group_vars = group_vars_2prime,
means = means, Y = "Y", Y_prime = "Y_prime")
# 3 week moving averages
epi5 <- epi4 %>%
group_by(!!!group_vars) %>%
arrange_epidate() %>%
# 3 week moving average
mutate(Y_prime_3wma = rollapply_epi(x = .data$Y_prime,
n_days = .data$epi_n_days,
width = 3,
na.rm = TRUE),
# b/na.rm = TRUE can calculate moving windows when that week
# is missing, want to switch those back to NA
Y_prime_3wma = ifelse(is.na(.data$Y_prime), NA, .data$Y_prime_3wma)) %>%
ungroup()
if(return_means) {
out <- list("df" = epi5, means = means)
} else {
out <- epi5
}
out
}
# calc_path_TURN ----------------------------------------------------------
# next: make examples. make documentation.
# add checks--ie sums should add, and should work missing dates
# try tricky version where add_path_dates is required.
# add optional check for sum count vs TURN
#' Calculate TURN by pathogen
#'
#' Calculates test utilization rate normalization (Y'') for each pathogen,
#' using daily site level data. See \code{?calc_TURN} for details on how TURN
#' is calculated (that function is for caclulating overall/total TURN--not for
#' a specific pathogen).
#'
#' @param df dataframe of daily pathogen data. Columns must include "date",
#' "InstrumentVersion", "daily_count", "TargetName", "SiteID"
#' @param TURN_df dataframe created by \code{calc_TURN()}. Must include
#' columns of "epidate", "n_active", "n_active_adj", "epi_n_days", "Y",
#' "Y_prime", "Y_prime_3wma", as well as columns listed in group_vars
#' argument.
#' @param means named vector of means. The vector needs to be length of
#' three giving the means of Y (tests/adjusted active instrument) for the
#' FA1.5, FA2.0 and Torch system (with those names provided as names of
#' elements in the vector). This should be created by the \code{calc_TURN()}
#' function when aggregating to the national level.
#' @param group_vars columns to group by. Unquoted names of columns
#' enclosed in vars(). Must at least include InstrumentVersion (which would then
#' result in a national level TURN)
#' @param site_info NULL or dataframe (if additonal group_vars are needed).
#' must contain a SiteID column, so that it can be merged with df. Additional
#' columns (e.g. Region) need to be present in the site_info dataframe
#' to be used as group_vars.
#' @param add_path_dates logical whether to fill in date gaps with NA when
#' smoothing Y".
#' May slow down the function. Can be needed when if get error for non
#' continuous epidates (there are some cases when data from
#' some instrument versions has been removed where this is needed)
#' @param run_check logical, whether to run a check to see if the sum of individual
#' pathogens TURNs equals the total TURN, for each epiweek. This check
#' is only relevant when co-detections and negatives are pathogen categories,
#' in which case you would expect the check to not throw an error.
#' @return Dataframe. The dataframe contains the following columns:
#' epiweek, epiyear, epidate (date of middle of epiweek), epi_count
#' (count of number of detections of pathogen that epiweek),
#' TargetName (pathogen), path_Y
#' (positives for that pathogen/adjust active instruments),
#' path_Y_prime (path_Y adjusted to mean of FA1.5,
#' or when InstrumentVersion == all_adj, this is the weighted average
#' of path Y prime for the other instruments).
#' Y_prime_3wma (3 week moving average
#' applied to path_Y_prime, this is TURN for pathogen when
#' InstrumentVersion == 'all_adj'). Other columns as supplied in TURN_df
#' and site_info
#' @examples
#' # list of national turn and means by instrument (means should be
#' # calculated at national level)
#' national_list <- calc_TURN(TUR_dat, return_means = TRUE)
#' means <- national_list$means
#' # national TURN
#' TURN_national <- national_list$df
#'
#' # regional (state) TURN
#' site_info <- get_site_info(rp_raw)
#' TURN_region <- calc_TURN(TUR_dat,
#' group_vars = vars(InstrumentVersion, Region),
#' site_info = site_info,
#' means = means)
#'
#' # calculate TURN for each pathogen -- national
#' calc_path_TURN(df = path_dat,
#' TURN_df = TURN_national,
#' means = means)
#'
#' # calculate TURN for each pathogen -- regional
#' calc_path_TURN(df = path_dat,
#' TURN_df = TURN_region,
#' means = means,
#' group_vars = vars(InstrumentVersion, Region),
#' site_info = site_info)
#'
# TURN by pathogen--national level
#' @export
calc_path_TURN <- function(df, TURN_df, means,
group_vars = vars(InstrumentVersion),
site_info = NULL,
add_path_dates = FALSE,
run_check = FALSE){
# checks on inputs ~~~~~
required_cols <- c("date", "InstrumentVersion", "daily_count", "TargetName",
"SiteID")
check_cols(df, required_cols)
stopifnot(is.null(site_info) | is.data.frame(site_info),
is.numeric(means),
is.logical(add_path_dates),
is.logical(run_check))
check_quosures(group_vars)
group_char <- quo2char(group_vars)
check_cols(TURN_df,
# prob don't need to require Y Y_prime and Y_prime_3wma
# but keeping the requirement for now, b/ joined
# for output
required_cols = c("epidate", "n_active", "n_active_adj",
"epi_n_days", "Y", "Y_prime", "Y_prime_3wma",
"epi_TUR_rp", group_char),
name = "TURN_df")
# I'm not sure if having redundant group_vars causes problems
# so I'm not allowing
not_allowed_groups <- c("date", "epiweek", "epiyear", "TargetName")
stopifnot("InstrumentVersion" %in% group_char)
if(any(not_allowed_groups %in% group_char)) {
stop(paste0("group_vars should not include any of\n",
paste(not_allowed_groups, collapse = "\n")))
}
if(is.null(means) & length(group_char) != 1) {
stop("named vector of means for each instrument version must be supplied
when additional group_var variables are used")
}
# end checks on inputs ~~~~
# rolling windows don't work when missing dates aren't filled in
df2 <- add_missing_dates_2_site_path(df)
# join in site info
if (!is.null(site_info)) {
if(!"SiteID" %in% names(df2) | !"SiteID" %in% names(site_info)) {
stop("SiteID column must be in df and site_info dataframes")
}
df2 <- left_join(df2, site_info, by = "SiteID")
}
if(!all(group_char %in% names(df2))) {
stop("all group_vars must be included in either df or site_info")
}
# daily pathogen counts-
path_dly <- site_2_dly_path(df2, c(vars(date, TargetName), group_vars))
# pathogens counts at epi week level
path_epi1 <- dly_2_epi_path(
path_dly, c(vars(epiyear, epiweek, TargetName), group_vars))
# adding in active instruments
path_epi2 <- TURN_df %>%
# We need to make new all_adj based on individual pathogens, so deletiing here
filter(.data$InstrumentVersion != "all_adj") %>%
# needed columns
select(c("epidate", "n_active", "n_active_adj", "epi_n_days", "epi_TUR_rp",
group_char)) %>%
right_join(path_epi1, by = c("epidate", group_char))
# check join integrity
# i think it is ok to have epi_count = 0 when n_active_adj is NA
if (any(is.na(path_epi2$n_active_adj) &
!(is.na(path_epi2$epi_count) | path_epi2$epi_count == 0))
){
stop("n_active_adj is NA when number expected")
}
# ok to do only b/ of above check (0s can happen when summing across all NAs)
path_epi2 <- path_epi2 %>%
mutate(epi_count = ifelse(is.na(.data$n_active_adj),
NA, .data$epi_count))
path_epi5a <- calc_path_Y_3wma(
df = path_epi2,
means = means,
# don't want to group by instrument version here
group_2prime = c(vars(epiyear, epiweek, epidate, TargetName),
group_vars[group_char != "InstrumentVersion"]),
group_3wma = c(vars(TargetName), group_vars),
add_path_dates = add_path_dates)
# adding in y_prime now so that have it for all_adj
out <- TURN_df %>%
select("epidate","Y", "Y_prime", "Y_prime_3wma", group_char) %>%
right_join(path_epi5a, by = c("epidate", group_char))
# final checks
cols2check <- c("epidate", "TargetName", group_char)
if(sum(is.na(out[, cols2check])) != 0) {
stop("At least one of following columns has missing values:\n",
paste(cols2check, collapse = "\n"),
"\nensure that no levels of group_vars in TURN_df are missing")
}
# check that path turns sum to total turn
if(run_check) {
check_path_TURN(out, group_vars = group_vars)
}
out
}
# calculate detection rate ------------------------------------------------
#' Calculate detection rate of pathogens
#'
#' Calculate the percentage of total tests that are of a specific pathogen
#' using the output from the \code{calc_path_TURN} function. It is calculated
#' as Y for the pathogen divided by total Y. Since the Ys are calculate with
#' the same denominator this is equivelant to tests for that pathogen/total tests.
#' A three week moving average is applied to the numerator and denominator
#' prior to calculation.
#'
#' @param df dataframe with at least columns of "InstrumentVersion", "epi_n_days",
#' "Y", "path_Y", "epidate", and any other columns included in group_vars.
#' @param group_vars names of columns to group by, wrapped in vars(). Must
#' included TargetName and can also include other columns to group by (e.g. a
#' column defining regions).
#' @param return_week logical whether output should include epiweek and
#' epiyear columns
#' @param window width of the window over which to apply rolling window sum
#' to calculate detection rate.
#' @return data frame that includes a column 'det_rate', which is the detection
#' rate (in percent), and TUR_mws which is the moving window sum of TUR
#' @examples
#' ###############################################
#' ############## Generating data ################
#' national_list <- calc_TURN(TUR_dat, return_means = TRUE)
#' means <- national_list$means
#' # national TURN
#' TURN_national <- national_list$df
#'
#' # regional (state) TURN
#' site_info <- get_site_info(rp_raw)
#' TURN_region <- calc_TURN(TUR_dat,
#' group_vars = vars(InstrumentVersion, Region),
#' site_info = site_info,
#' means = means)
#'
#' # calculate TURN for each pathogen -- national
#' path_turn_national <- calc_path_TURN(df = path_dat,
#' TURN_df = TURN_national,
#' means = means)
#'
#' # calculate TURN for each pathogen -- regional
#' path_turn_region <- calc_path_TURN(df = path_dat,
#' TURN_df = TURN_region,
#' means = means,
#' group_vars = vars(InstrumentVersion, Region),
#' site_info = site_info)
#'
#' ###############################################
#' ############## detection rate ################
#'
#' # calculate detection rate--national
#' calc_det_rate(path_turn_national)
#' # calculate detection rate--regional
#' calc_det_rate(path_turn_region, group_vars = vars(TargetName, Region))
#' @export
calc_det_rate <- function(df, group_vars = vars(TargetName), return_week = TRUE,
window = 3) {
check_quosures(group_vars)
required_cols <- c("InstrumentVersion", "epi_count", "epi_TUR_rp",
"epidate", quo2char(group_vars))
check_cols(df, required_cols, "calc_det_rate()")
if (!"TargetName" %in% quo2char(group_vars)) {
warning("group_vars should include TargetName")
}
out <- df %>%
filter(.data$InstrumentVersion == "all") %>%
group_by(!!!group_vars) %>%
arrange_epidate() %>%
# moving window sum of count
mutate(count_mws = zoo::rollapply(epi_count, width = window, FUN= sum,
na.rm = TRUE, partial = TRUE),
# TUR moving window sum
TUR_mws = zoo::rollapply(epi_TUR_rp, width = window, FUN= sum,
na.rm = TRUE, partial = TRUE),
det_rate = count_mws/TUR_mws*100) %>%
select(c("epidate", quo2char(group_vars), "det_rate", "TUR_mws")) %>%
ungroup()
if(return_week) {
out <- out %>%
mutate(epiweek = lubridate::epiweek(.data$epidate),
epiyear = lubridate::epiyear(.data$epidate))
}
out
}
# clean TURN output -------------------------------------------------------
#' clean TURN dataframe
#'
#' Output a simpler (fewer columns) dataframe, given an input dataframe
#' made by either \code{calc_TURN()} or \code{calc_path_TURN()} functions.
#' New TURN column is returned, which is the the Y_prime_3wma input column, where
#' InstrumentVersion == 'all_adj'.
#'
#' @param df dataframe with at least columns of "InstrumentVersion", "epi_n_days",
#' "Y", "path_Y", "epidate", and any other columns included in group_vars.
#' @param extra_cols character vector of additional columns to keep in output
#' @param is_path logical, whether dataframe is of pathogen level data
#' (i.e. created in \code{calc_path_TURN()}).
#' @return dataframe that includes TURN column (final normalization). It also
#' includes path_TURN (the TURN for individual pathogens, when the
#' is_path argument is TRUE)
#' @examples
#'
#' ############## Generating data ################
#' national_list <- calc_TURN(TUR_dat, return_means = TRUE)
#' means <- national_list$means
#' # national TURN
#' TURN_national <- national_list$df
#'
#' # calculate TURN for each pathogen -- national
#' path_turn_national <- calc_path_TURN(df = path_dat,
#' TURN_df = TURN_national,
#' means = means)
#'
#' ##### creating cleaner/simple dataframe #######
#'
#' clean_TURN_output(TURN_national)
#' clean_TURN_output(path_turn_national, is_path = TRUE)
#' @export
clean_TURN_output <- function(df, extra_cols = NULL, is_path = FALSE) {
required_cols <- c("epidate", "epiweek", "epiyear", "Y_prime_3wma",
"InstrumentVersion")
check_cols(df, required_cols, "clean_TURN_output()")
df2 <- filter(df, .data$InstrumentVersion == "all_adj")
df2 <- rename(df2, "TURN" = "Y_prime_3wma")
cols2keep <- c("epidate", "epiweek", "epiyear", extra_cols, "TURN")
if (is_path){
cols2keep <- c(cols2keep, "path_TURN", "TargetName")
df2 <- rename(df2, "path_TURN" = "path_Y_prime_3wma")
}
out <- df2[, cols2keep]
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.