#' Describes a numeric variable by strata of another numeric grouping variable.
#'
#' This has similar functionality to `tapply` or the dplyr chain `group_by` %>%
#' `summarise`. It offers additional flexilibity over `ds.tapply` in that it
#' allows you to specify upper and lower values for each strata. By contrast,
#' ds.tapply will produce a summary based on every unique value of the grouping
#' variable, which may not always be what is required.
#'
#' @importFrom DSI datashield.connections_find datashield.assign
#' @importFrom dsBaseClient ds.meanSdGp
#' @importFrom purrr map
#' @importFrom tibble as_tibble
#' @importFrom dplyr bind_rows %>% slice
#' @importFrom tidyr separate
#'
#' @template conns
#' @template df
#' @param outcome String specifying outcome variable within `df`.
#' @param group_var String specifying grouping variable within 'df'.
#' @param intervals Optionally, numeric vector defining how to stratify
#' `group_var`. Values should specify alternately lower and upper values for
#' each strata. If NULL, `outcome` is summarised by every unique value of
#' `group_var`.
#' @template checks
#'
#' @return Tibble containing mean values for each strata of `group_var`.
#'
#' @family descriptive functions
#'
#' @export
dh.meanByGroup <- function(df = NULL, outcome = NULL, group_var = NULL,
intervals = NULL, conns = NULL, checks = FALSE) {
value <- op <- tmp <- varname <- new_df_name <- age <- group <- cohort <-
. <- enough_obs <- variable <- level <- std.dev <- nvalid <- ntotal <-
x <- nvalid_2 <- nvalid_1 <- mean_2 <- std.dev_2 <- nmissing <- NULL
if (is.null(df)) {
stop("`df` must not be NULL.", call. = FALSE)
}
if (is.null(outcome)) {
stop("`outcome` must not be NULL.", call. = FALSE)
}
if (is.null(group_var)) {
stop("`group_var` must not be NULL.", call. = FALSE)
}
if (is.null(conns)) {
conns <- datashield.connections_find()
}
if (checks == TRUE) {
.isDefined(df = df, vars = vars, conns = conns)
}
## There is an easy way and a hard way. If we bin based on integer units of
## the binning variable it is quite quick
if (is.null(intervals)) {
## ---- First we round up the age variable -----------------------------------------------
DSI::datashield.assign(
conns, "age_tmp", as.symbol(paste0(df, "$", group_var, "+0.5"))
)
calltext <- call("asIntegerDS", "age_tmp")
DSI::datashield.assign(conns, "age_round", calltext)
## ---- Now we get the mean values for the outcome by age --------------------------------
calltext <- paste0(
"meanSdGpDS(", paste0(df, "$", outcome), ",",
"age_round", ")"
)
mean_tmp <- DSI::datashield.aggregate(conns, as.symbol(calltext))
## ---- Deal with disclosure issues ----------------------------------------
warnings <- .checkDisclosureMeanGp(mean_tmp)
if (nrow(warnings$issues) > 0) {
warning(
"The following cohorts have at least one group with between 1 and 2 observations
so it is not possible to return grouped values. Try collapsing some categories or
re-run the function using the `intervals` argument. \n\n",
paste0(warnings$issues$cohort, collapse = ", ")
)
}
## ---- Now put into neat long format -------------------------------------------------------
out <- warnings$no_issues_data %>%
map(function(x) {
x$Mean_gp %>%
as_tibble(rownames = "age") %>%
mutate(age = as.numeric(str_remove(age, "age_round_")))
}) %>%
bind_rows(.id = "cohort") %>%
dplyr::rename(mean = value)
## ---- Remove temporary objects ------------------------------------------------------------
dh.tidyEnv(
obj = c("age_tmp", "age_round"),
type = "remove",
conns = conns
)
} else if (!is.null(intervals)) {
## ---- This is the harder one -------------------------------------------------------------------
## First we need to create a table defining our age bands.
pairs <- split(intervals, ceiling(seq_along(intervals) / 2))
subnames <- unlist(
pairs %>% map(~ paste0("grp", "_", paste0(., collapse = "_"))),
use.names = FALSE
)
cats <- tibble(
varname = rep(subnames, each = 2),
value = intervals,
op = rep(c(">=", "<="), times = (length(intervals) / 2)),
tmp = ifelse(op == ">=", "gte", "lte"),
new_df_name = paste0("grp", tmp, value)
)
## Now we create vectors of 1s and 0s indicating whether the above criteria
## are met for each subject.
cats %>%
pmap(function(value, op, new_df_name, ...) {
ds.Boole(
V1 = paste0(df, "$", group_var),
V2 = value,
Boolean.operator = op,
newobj = new_df_name,
datasources = conns
)
})
suppressMessages(
assign_conditions <- cats %>%
group_by(varname) %>%
summarise(condition = paste(new_df_name, collapse = "*"))
)
assign_conditions %>%
pmap(function(condition, varname) {
DSI::datashield.assign(conns, varname, as.symbol(condition))
})
## Check disclosure issues
disclosure <- assign_conditions$varname %>%
map(~ .checkDisclosure(., conns = conns)) %>%
set_names(assign_conditions$varname) %>%
bind_rows(.id = "variable")
no_obs <- disclosure %>% dplyr::filter(enough_obs == FALSE)
if (nrow(no_obs) > 0) {
warning(paste(capture.output({
cat("These cohorts have no observations within the following
intervals and will be excluded\n\n")
no_obs %>%
dplyr::select(variable, cohort) %>%
print(n = Inf)
}), collapse = "\n"))
}
valid_obs <- disclosure %>% dplyr::filter(enough_obs == TRUE)
## We then use these vectors to summarise mean observed height at the age
## periods we are interested in.
obs_by_agecat_comb <- valid_obs %>%
pmap(function(variable, cohort, ...) {
ds.meanSdGp(
x = paste0(df, "$", outcome),
y = variable,
type = "split",
datasources = conns[cohort]
)
})
## Now we take these values and put them into a neater table
out <- obs_by_agecat_comb %>%
map(function(x) {
tibble(
group = dimnames(x$Mean_gp_study)[[1]],
mean = as.numeric(x$Mean_gp_study),
std.dev = as.numeric(x$StDev_gp_study),
nvalid = as.numeric(x$Nvalid_gp_study),
cohort = colnames(x$Mean_gp_study)
)
}) %>%
bind_rows() %>%
separate(group, into = c("group", "level"), sep = "_(?=[^_]+$)") %>%
pivot_wider(
names_from = "level",
values_from = c("nvalid", "mean", "std.dev")
) %>%
dplyr::rename(
"nvalid" = nvalid_2,
"nmissing" = nvalid_1,
"mean" = mean_2,
"std.dev" = std.dev_2
) %>%
mutate(group = str_remove(group, "grp_")) %>%
dplyr::select(cohort, group, mean, std.dev, nvalid, nmissing)
## ---- Remove temporary objects -------------------------------------------
dh.tidyEnv(
obj = c(cats$new_df_name, assign_conditions$varname),
type = "remove",
conns = conns
)
}
return(out)
}
#' Internal function to check disclosure issues in this function
#'
#' @param mean_gp output from ds.meanSdGp
#'
#' @importFrom purrr map
#' @importFrom tibble tibble
#' @importFrom dplyr %>% bind_rows filter
#' @noRd
.checkDisclosureMeanGp <- function(mean_gp) {
warnings <- mean_gp %>%
map(function(x) {
tibble(warning = !is.null(x$Warning))
}) %>%
bind_rows(.id = "cohort")
issues <- warnings %>% dplyr::filter(warning == TRUE)
no_issues <- mean_gp[!names(mean_gp) %in% issues$cohort]
return(list(
summary = warnings,
issues = issues,
no_issues_data = no_issues
))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.