Nothing
#' Extract information of the sample contained in a data set
#'
#' Given a data set and grouping variables, this function returns mean values
#' for numeric variables and modus for characters and factors. Usually
#' this function should not be called directly but will rather be called
#' as part of a call to \code{make_newdata}.
#'
#' @rdname sample_info
#' @param x A data frame (or object that inherits from \code{data.frame}).
#' @importFrom stats median
#' @return A data frame containing sample information (for each group).
#' If applied to an object of class \code{ped}, the sample means of the
#' original data is returned.
#' Note: When applied to a \code{ped} object, that doesn't contain covariates
#' (only interval information), returns data frame with 0 columns.
#'
#' @export
#' @keywords internal
sample_info <- function(x) {
UseMethod("sample_info", x)
}
#' @import checkmate dplyr
#' @importFrom purrr compose
#' @export
#' @rdname sample_info
sample_info.data.frame <- function(x) {
cn <- colnames(x)
num <- summarize_if (x, .predicate = is.numeric, ~mean(., na.rm = TRUE))
fac <- summarize_if (x, .predicate = compose("!", is.numeric), modus)
nnames <- intersect(names(num), names(fac))
if (length(nnames) != 0) {
suppressMessages(
x <- left_join(num, fac) %>% group_by(!!!lapply(nnames, as.name))
)
} else {
x <- bind_cols(num, fac)
}
return(select(x, one_of(cn)))
}
#' @rdname sample_info
#' @import checkmate dplyr
#' @importFrom rlang sym
#' @export
sample_info.ped <- function(x) {
# is.grouped_df
# remove "noise" information on interval variables
grps <- group_vars(x)
iv <- attr(x, "intvars")
id_var <- attr(x, "id_var")
x <- x %>%
group_by(!!sym(id_var)) %>%
slice(1) %>%
ungroup() %>%
grouped_df(grps) %>%
select(-one_of(iv))
if (test_data_frame(x, min.rows = 1, min.cols = 1)) {
sample_info.data.frame(x)
} else {
NULL
}
}
#' @rdname sample_info
#' @export
sample_info.fped <- function(x) {
x %>% select_if(~!is.matrix(.x)) %>% sample_info.ped()
}
#' Create a data frame from all combinations of data frames
#'
#' Works like \code{\link[base]{expand.grid}} but for data frames.
#'
#' @importFrom dplyr slice bind_cols
#' @importFrom vctrs vec_c
#' @importFrom purrr map map_lgl
#' @importFrom checkmate test_data_frame
#' @param ... Data frames that should be combined to one data frame.
#' Elements of first df vary fastest, elements of last df vary slowest.
#' @examples
#' combine_df(
#' data.frame(x=1:3, y=3:1),
#' data.frame(x1=c("a", "b"), x2=c("c", "d")),
#' data.frame(z=c(0, 1)))
#' @export
#' @keywords internal
combine_df <- function(...) {
dots <- list(...)
if (!all(sapply(dots, test_data_frame))) {
stop("All elements in ... must inherit from data.frame!")
}
ind_seq <- map(dots, ~ seq_len(nrow(.x)))
not_empty <- map_lgl(ind_seq, ~ length(.x) > 0)
ord <- lapply(dots[not_empty], function(z) colnames(z)) |> unlist()
out <- do.call(expand_grid, rev(dots[not_empty]))
out <- out[, ord]
}
#' Construct a data frame suitable for prediction
#'
#' This functions provides a flexible interface to create a data set that
#' can be plugged in as \code{newdata} argument to a suitable \code{predict}
#' function (or similar).
#' The function is particularly useful in combination with one of the
#' \code{add_*} functions, e.g., \code{\link[pammtools]{add_term}},
#' \code{\link[pammtools]{add_hazard}}, etc.
#'
#' @rdname newdata
#' @aliases make_newdata
#' @importFrom tidyr expand_grid
#' @inheritParams sample_info
#' @param ... Covariate specifications (expressions) that will be evaluated
#' by looking for variables in \code{x}. Must be of the form \code{z = f(z)}
#' where \code{z} is a variable in the data set and \code{f} a known
#' function that can be usefully applied to \code{z}. Note that this is also
#' necessary for single value specifications (e.g. \code{age = c(50)}).
#' For data in PED (piece-wise exponential data) format, one can also specify
#' the time argument, but see "Details" an "Examples" below.
#' @import dplyr
#' @importFrom checkmate assert_data_frame assert_character
#' @importFrom purrr map
#' @importFrom tidyr expand_grid
#' @details Depending on the type of variables in \code{x}, mean or modus values
#' will be used for variables not specified in ellipsis
#' (see also \code{\link[pammtools]{sample_info}}). If \code{x} is an object
#' that inherits from class \code{ped}, useful data set completion will be
#' attempted depending on variables specified in ellipsis. This is especially
#' useful, when creating a data set with different time points, e.g. to
#' calculate survival probabilities over time (\code{\link[pammtools]{add_surv_prob}})
#' or to calculate a time-varying covariate effects (\code{\link[pammtools]{add_term}}).
#' To do so, the time variable has to be specified in \code{...}, e.g.,
#' \code{tend = seq_range(tend, 20)}. The problem with this specification is that
#' not all values produced by \code{seq_range(tend, 20)} will be actual values
#' of \code{tend} used at the stage of estimation (and in general, it will
#' often be tedious to specify exact \code{tend} values). \code{make_newdata}
#' therefore finds the correct interval and sets \code{tend} to the respective
#' interval endpoint. For example, if the intervals of the PED object are
#' \eqn{(0,1], (1,2]} then \code{tend = 1.5} will be set to \code{2} and the
#' remaining time-varying information (e.g. offset) completed accordingly.
#' See examples below.
#' @examples
#' # General functionality
#' tumor %>% make_newdata()
#' tumor %>% make_newdata(age=c(50))
#' tumor %>% make_newdata(days=seq_range(days, 3), age=c(50, 55))
#' tumor %>% make_newdata(days=seq_range(days, 3), status=unique(status), age=c(50, 55))
#' # mean/modus values of unspecified variables are calculated over whole data
#' tumor %>% make_newdata(sex=unique(sex))
#' tumor %>% group_by(sex) %>% make_newdata()
#'
#' # Examples for PED data
#' ped <- tumor %>% slice(1:3) %>% as_ped(Surv(days, status)~., cut = c(0, 500, 1000))
#' ped %>% make_newdata(age=c(50, 55))
#'
#' # if time information is specified, other time variables will be specified
#' # accordingly and offset calculated correctly
#' ped %>% make_newdata(tend = c(1000), age = c(50, 55))
#' ped %>% make_newdata(tend = unique(tend))
#' ped %>% group_by(sex) %>% make_newdata(tend = unique(tend))
#'
#' # tend is set to the end point of respective interval:
#' ped <- tumor %>% as_ped(Surv(days, status)~.)
#' seq_range(ped$tend, 3)
#' make_newdata(ped, tend = seq_range(tend, 3))
#' @export
make_newdata <- function(x, ...) {
UseMethod("make_newdata", x)
}
#' @inherit make_newdata
#' @rdname newdata
#' @export
make_newdata.default <- function(x, ...) {
assert_data_frame(x, all.missing = FALSE, min.rows = 2, min.cols = 1)
orig_names <- names(x)
expressions <- quos(...)
expr_evaluated <- map(expressions, lazyeval::f_eval, data = x) |>
map(c)
# construct data parts depending on input type
lgl_atomic <- map_lgl(expr_evaluated, is_atomic)
# part1 <- expr_evaluated[lgl_atomic] |> cross_df()
part1 <- do.call(tidyr::expand_grid, rev(expr_evaluated[lgl_atomic]))
part2 <- do.call(combine_df, expr_evaluated[!lgl_atomic])
ndf <- combine_df(part1, part2)
rest <- x %>% select(-one_of(c(colnames(ndf))))
if (ncol(rest) > 0) {
si <- sample_info(rest) %>% ungroup()
ndf <- combine_df(si, ndf)
}
ndf %>% select(one_of(orig_names))
}
#' @rdname newdata
#' @inherit make_newdata.default
#' @export
make_newdata.ped <- function(x, ...) {
assert_data_frame(x, all.missing = FALSE, min.rows = 2, min.cols = 1)
# prediction time points have to be interval end points so that piece-wise
# constancy of predicted hazards is respected. If user overrides this, warn.
orig_vars <- names(x)
int_df <- int_info(x)
expressions <- quos(...)
dot_names <- names(expressions)
int_names <- names(int_df)
# x <- select(x, -one_of(setdiff(int_names, c(dot_names, "intlen", "intmid"))))
ndf <- x %>%
select(-one_of(setdiff(int_names, c(dot_names, "intlen", "intmid")))) %>%
unped() %>%
make_newdata(...)
if (any(names(int_df) %in% names(ndf))) {
int_tend <- get_intervals(x, ndf$tend)$tend
if (!all(ndf$tend == int_tend)) {
message("Some values of 'tend' have been set to the respective interval end-points")
}
ndf$tend <- int_tend
suppressMessages(
ndf <- ndf %>% left_join(int_df)
)
} else {
ndf <- combine_df(int_df[1, ], ndf)
}
int_names <- intersect(int_names, c("intlen", orig_vars))
ndf %>% select(one_of(c(int_names, setdiff(orig_vars, int_names)))) %>%
mutate(
intlen = .data$tend - .data$tstart,
offset = log(.data$tend - .data$tstart),
ped_status = 0)
}
#' @rdname newdata
#' @inherit make_newdata.ped
#' @importFrom rlang quos
#' @export
make_newdata.fped <- function(x, ...) {
assert_data_frame(x, all.missing = FALSE, min.rows = 2, min.cols = 1)
# prediction time points have to be interval end points so that piece-wise
# constancy of predicted hazards is respected. If user overrides this, warn.
expressions <- quos(...)
dot_names <- names(expressions)
orig_vars <- names(x)
cumu_vars <- setdiff(unlist(attr(x, "func_mat_names")), dot_names)
cumu_smry <- smry_cumu_vars(x, attr(x, "time_var")) %>%
select(one_of(cumu_vars))
int_names <- attr(x, "intvars")
ndf <- x %>%
select(one_of(setdiff(names(x), cumu_vars))) %>%
unfped() %>%
make_newdata(...)
out_df <- do.call(combine_df, compact(list(cumu_smry, ndf)))
int_df <- int_info(attr(x, "breaks"))
suppressMessages(
out_df <- out_df %>% left_join(int_df) %>%
select(-one_of(c("intmid"))) %>% as_tibble()
)
# adjust lag-lead indicator
out_df <- adjust_ll(out_df, x)
out_df
}
smry_cumu_vars <- function(data, time_var) {
cumu_vars <- unlist(attr(data, "func_mat_names"))
func_list <- attr(data, "func")
z_vars <- map(func_list, ~get_zvars(.x, time_var, length(func_list))) %>%
unlist()
smry_z <- select(data, one_of(z_vars)) %>%
map(~ .x[1, ]) %>% map(~mean(unlist(.x))) %>% bind_cols()
smry_time <- select(data, setdiff(cumu_vars, z_vars)) %>% map(~.x[1, 1])
bind_cols(smry_z, smry_time)
}
get_zvars <- function(func, time_var, n_func) {
col_vars <- func$col_vars
all_vars <- make_mat_names(c(col_vars, "LL"), func$latency_var, func$tz_var,
func$suffix, n_func)
time_vars <- make_mat_names(c(time_var, func$tz_var, "LL"),
func$latency_var, func$tz_var, func$suffix, n_func)
setdiff(all_vars, time_vars)
}
## apply ll_fun to newly created data
adjust_ll <- function(out_df, data) {
func_list <- attr(data, "func")
n_func <- length(func_list)
LL_names <- grep("LL", unlist(attr(data, "func_mat_names")), value = TRUE)
for (i in LL_names) {
ind_ll <- map_lgl(names(attr(data, "ll_funs")), ~grepl(.x, i))
if (any(ind_ll)) {
ind_ll <- which(ind_ll)
} else {
ind_ll <- 1
}
func <- func_list[[ind_ll]]
ll_i <- attr(data, "ll_funs")[[ind_ll]]
tz_var <- attr(data, "tz_vars")[[ind_ll]]
tz_var <- make_mat_names(tz_var, func$latency_var, func$tz_var, func$suffix,
n_func)
if (func$latency_var == "") {
out_df[[i]] <- ll_i(out_df[["tend"]], out_df[[tz_var]]) * 1L
} else {
out_df[[i]] <- ll_i(out_df[["tend"]], out_df[["tend"]] -
out_df[[tz_var]]) * 1L
}
}
out_df
}
# All variables that represent follow-up time should have the same values
# adjust_time_vars <- function(out_df, data, dot_names) {
# time_vars <- c("tend",
# grep(attr(data, "time_var"), unlist(attr(data, "func_mat_names")), value=TRUE))
# time_vars_dots <- c(grep("tend", dot_names, value=TRUE),
# grep(attr(data, "time_var"), dot_names, value=TRUE))
# if (length(time_vars_dots) == 0) {
# time_vars_dots <- "tend"
# } else {
# if (length(time_vars_dots) > 1) {
# warning(paste0("Only one of ", paste0(time_vars_dots, collapse=", "),
# "must be specified. Only the first one will be used!"))
# time_vars_dots <- time_vars_dots[1]
# }
# }
# for (i in setdiff(time_vars, time_vars_dots)) {
# out_df[[i]] <- out_df[[time_vars_dots]]
# }
# out_df
# }
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.