Nothing
#' Performs checks to gather a list of guesses and iterations
#'
#' \code{Gather_Guesses_CPP} called from within R, uses a list of options and the model definition to generate a list of parameters and iterations that do not produce errors
#'
#' @inheritParams R_template
#' @param a_n_default center of parameter distribution guessing scope
#' @param dfc vector matching subterm number to matrix column
#' @param x_all covariate matrix
#'
#' @return returns a list of the final results
#' @export
#' @examples
#' library(data.table)
#' a <- c(0, 1, 2, 3, 4, 5, 6)
#' b <- c(1, 2, 3, 4, 5, 6, 7)
#' c <- c(0, 1, 0, 0, 0, 1, 0)
#' d <- c(3, 4, 5, 6, 7, 8, 9)
#' df <- data.table::data.table("a" = a, "b" = b, "c" = c, "d" = d)
#' time1 <- "a"
#' time2 <- "b"
#' event <- "c"
#' names <- c("d")
#' term_n <- c(0)
#' tform <- c("loglin")
#' keep_constant <- c(0)
#' a_n <- c(-0.1)
#' a_n_default <- a_n
#' modelform <- "M"
#' control <- list(
#' "ncores" = 2, "lr" = 0.75, "maxiter" = -1,
#' "halfmax" = 5, "epsilon" = 1e-9,
#' "deriv_epsilon" = 1e-9, "abs_max" = 1.0,
#' "dose_abs_max" = 100.0, "verbose" = FALSE, "ties" = "breslow",
#' "double_step" = 1
#' )
#' guesses_control <- list()
#' model_control <- list()
#' all_names <- unique(names(df))
#' dfc <- match(names, all_names)
#' term_tot <- max(term_n) + 1
#' x_all <- as.matrix(df[, all_names, with = FALSE])
#' control <- Def_Control(control)
#' guesses_control <- Def_Control_Guess(guesses_control, a_n)
#' model_control <- Def_model_control(model_control)
#' options(warn = -1)
#' Gather_Guesses_CPP(
#' df, dfc, names, term_n, tform, keep_constant,
#' a_n, x_all, a_n_default,
#' modelform, control, guesses_control
#' )
#' @importFrom rlang .data
Gather_Guesses_CPP <- function(df, dfc, names, term_n, tform, keep_constant, a_n, x_all, a_n_default, modelform, control, guesses_control, model_control = list()) {
tryCatch(
{
df <- setDT(df)
},
error = function(e) {
df <- data.table(df)
}
)
if (typeof(a_n) != "list") {
a_n <- list(a_n)
}
term_tot <- max(term_n) + 1
a_ns <- c(NaN)
maxiters <- c(NaN)
model_control <- Def_model_control(model_control)
val <- Def_modelform_fix(control, model_control, modelform, term_n)
modelform <- val$modelform
model_control <- val$model_control
val <- Correct_Formula_Order(
term_n, tform, keep_constant, a_n_default,
names
)
term_n <- val$term_n
tform <- val$tform
keep_constant <- val$keep_constant
a_n_default <- val$a_n
names <- val$names
if (length(a_n_default) != length(names)) {
stop(paste("Error: Default Parameters used: ", length(a_n_default),
", Covariates used: ", length(names),
sep = ""
))
}
for (i in seq_along(tform)) {
if (grepl("_int", tform[i], fixed = FALSE)) {
# fine
} else if (grepl("lin_exp_exp_slope", tform[i], fixed = FALSE)) {
# fine
} else if (grepl("_slope", tform[i], fixed = FALSE)) {
# fine
} else if (grepl("loglin", tform[i], fixed = FALSE)) {
# fine
} else if (grepl("lin", tform[i], fixed = FALSE)) {
# fine
} else {
stop(paste("Error: tform not implemented ", tform[i], sep = ""))
}
}
for (i in seq_len(length(a_n))) {
a_n0 <- a_n[[i]]
if (length(a_n0) != length(names)) {
stop(paste("Error: Parameters used: ", length(a_n0),
", Covariates used: ", length(names),
sep = ""
))
}
if (length(term_n) < length(names)) {
stop(paste("Error: Terms used: ", length(term_n),
", Covariates used: ", length(names),
sep = ""
))
} else if (length(term_n) > length(names)) {
stop(paste("Error: Terms used: ", length(term_n),
", Covariates used: ", length(names),
sep = ""
))
}
if (length(tform) < length(names)) {
stop(paste("Error: Term types used: ", length(tform),
", Covariates used: ", length(names),
sep = ""
))
} else if (length(tform) > length(names)) {
stop(paste("Error: Term types used: ", length(tform),
", Covariates used: ", length(names),
sep = ""
))
}
if (length(keep_constant) < length(names)) {
keep_constant <- c(keep_constant, rep(
0.0,
length(names) - length(keep_constant)
))
} else if (length(keep_constant) > length(names)) {
keep_constant <- keep_constant[seq_len(length(names))]
}
rmin <- guesses_control$rmin
rmax <- guesses_control$rmax
if (length(rmin) != length(rmax)) {
warning("Warning: rmin and rmax lists not equal size, defaulting to lin and loglin min/max values")
}
keep <- risk_check_transition(
term_n, tform, a_n0, dfc, x_all,
0, modelform,
control, model_control,
keep_constant, term_tot
)
if (keep) {
if (is.nan(maxiters[1])) {
a_ns <- c(a_n0)
maxiters <- c(guesses_control$maxiter)
} else {
a_ns <- c(a_ns, a_n0)
maxiters <- c(maxiters, guesses_control$maxiter)
}
}
}
while (length(maxiters) - length(a_n) < guesses_control$guesses - 1) {
if (guesses_control$verbose >= 3) {
message(paste("Note: ", length(maxiters), " valid guesses",
sep = ""
)) # nocov
}
if (length(rmin) != length(rmax)) {
for (i in seq_along(tform)) {
if (guesses_control$guess_constant[i] == 0) {
if (grepl("_int", tform[i], fixed = FALSE)) {
a_n0[i] <- runif(1,
min = guesses_control$intercept_min,
max = guesses_control$intercept_max
) +
a_n_default[i]
} else if (grepl("lin_exp_exp_slope", tform[i],
fixed = FALSE
)) {
a_n0[i] <- runif(1,
min = guesses_control$exp_slope_min,
max = guesses_control$exp_slope_max
) +
a_n_default[i]
} else if (grepl("_slope", tform[i], fixed = FALSE)) {
a_n0[i] <- runif(1,
min = guesses_control$lin_min,
max = guesses_control$lin_max
) +
a_n_default[i]
} else if (grepl("loglin", tform[i], fixed = FALSE)) {
a_n0[i] <- runif(1,
min = guesses_control$exp_min,
max = guesses_control$exp_max
) +
a_n_default[i]
} else if (grepl("lin", tform[i], fixed = FALSE)) {
a_n0[i] <- runif(1,
min = guesses_control$lin_min,
max = guesses_control$lin_max
) +
a_n_default[i]
} else {
stop(paste("Error: tform not implemented ", tform[i],
sep = ""
))
}
} else {
a_n0[i] <- a_n_default[i]
}
}
} else {
for (i in seq_along(tform)) {
if (guesses_control$guess_constant[i] == 0) {
a_n0[i] <- runif(1,
min = guesses_control$rmin[i],
max = guesses_control$rmax[i]
) +
a_n_default[i]
} else {
a_n0[i] <- a_n_default[i]
}
}
}
keep <- risk_check_transition(
term_n, tform, a_n0, dfc, x_all,
0, modelform, control, model_control,
keep_constant, term_tot
)
if (keep) {
if (is.nan(maxiters[1])) {
a_ns <- c(a_n0)
maxiters <- c(guesses_control$maxiter)
} else {
a_ns <- c(a_ns, a_n0)
maxiters <- c(maxiters, guesses_control$maxiter)
}
}
}
list("a_ns" = a_ns, "maxiters" = maxiters)
}
#' Corrects the order of terms/formula/etc
#'
#' \code{Correct_Formula_Order} checks the order of formulas given and corrects any ordering issues, orders alphabetically, by term number, etc.
#'
#' @inheritParams R_template
#'
#' @return returns the corrected lists
#' @export
#' @family Data Cleaning Functions
#' @examples
#' library(data.table)
#' ## basic example code reproduced from the starting-description vignette
#' term_n <- c(0, 1, 1, 0, 0)
#' tform <- c("loglin", "quad_slope", "lin", "lin_int", "lin_slope")
#' keep_constant <- c(0, 0, 0, 1, 0)
#' a_n <- c(1, 2, 3, 4, 5)
#' names <- c("a", "a", "a", "a", "a")
#' val <- Correct_Formula_Order(term_n, tform, keep_constant,
#' a_n, names,
#' cons_mat = matrix(c(0)),
#' cons_vec = c(0)
#' )
#' term_n <- val$term_n
#' tform <- val$tform
#' keep_constant <- val$keep_constant
#' a_n <- val$a_n
#' names <- val$names
#'
Correct_Formula_Order <- function(term_n, tform, keep_constant, a_n, names, cons_mat = matrix(c(0)), cons_vec = c(0), verbose = FALSE, model_control = list()) {
#
verbose <- Check_Verbose(verbose)
if ("para_number" %in% names(model_control)) {
# pass
} else {
model_control["para_number"] <- 0
}
if (is.matrix(cons_mat)) {
# pass
} else {
cons_mat <- as.matrix(cons_mat)
warning("Warning: Constraint Matrix was not a matrix, converted")
}
#
if (min(keep_constant) < 0) {
stop(paste("Error: keep_constant expects 0/1 values, minimum value was ",
min(keep_constant),
sep = ""
))
}
if (max(keep_constant) > 1) {
stop(paste("Error: keep_constant expects 0/1 values, maximum value was ",
max(keep_constant),
sep = ""
))
}
if (any(keep_constant != round(keep_constant))) {
stop(paste("Error: keep_constant expects 0/1 values, atleast one value was noninteger",
sep = ""
))
}
if (any(term_n != round(term_n))) {
stop(paste("Error: term_n expects integer values, atleast one value was noninteger",
sep = ""
))
}
if (min(term_n) != 0) {
if (verbose >= 2) {
warning(paste("Warning: term_n expects nonnegative integer values and a minimum of 0, minimum value was ",
min(term_n),
". Minimum value set to 0, others shifted by ",
-1 * min(term_n),
sep = ""
))
}
term_n <- term_n - min(term_n)
}
if (length(sort(unique(term_n))) != length(min(term_n):max(term_n))) {
stop(paste("Error: term_n expects no missing integer values. Term numbers range from ", min(term_n),
" to ", max(term_n), " but term_n has ",
length(unique(term_n)), " unique values instead of ",
length(min(term_n):max(term_n)),
sep = ""
))
}
if (length(keep_constant) < length(names)) {
keep_constant <- c(keep_constant, rep(0.0, length(names) -
length(keep_constant)))
} else if (length(keep_constant) > length(names)) {
keep_constant <- keep_constant[seq_len(length(names))]
}
if (length(term_n) < length(names)) {
if (verbose >= 2) {
warning(paste("Warning: Terms used: ", length(term_n),
", Covariates used: ", length(names),
sep = ""
))
}
term_n <- c(term_n, rep(0, length(names) -
length(term_n)))
} else if (length(term_n) > length(names)) {
if (verbose >= 2) {
warning(paste("Warning: Terms used: ", length(term_n),
", Covariates used: ", length(names),
sep = ""
))
}
term_n <- term_n[seq_len(length(names))]
}
if (length(tform) < length(names)) {
if (verbose >= 2) {
warning(paste("Warning: Term types used: ", length(tform),
", Covariates used: ", length(names),
sep = ""
))
}
tform <- c(tform, rep("loglin", length(names) -
length(tform)))
} else if (length(tform) > length(names)) {
if (verbose >= 2) {
warning(paste("Warning: Term types used: ", length(tform),
", Covariates used: ", length(names),
sep = ""
))
}
tform <- tform[seq_len(length(names))]
}
col_to_cons <- c()
for (i in keep_constant) {
if (i == 0) {
if (length(col_to_cons) == 0) {
col_to_cons <- c(1)
} else {
col_to_cons <- c(col_to_cons, max(col_to_cons) + 1)
}
} else {
col_to_cons <- c(col_to_cons, 0)
}
}
if (ncol(cons_mat) > 1) {
if (ncol(cons_mat) != (length(keep_constant) - sum(keep_constant))) {
stop("Error: Constraint matrix has incorrect number of columns")
}
if (nrow(cons_mat) != length(cons_vec)) {
stop("Error: Constraint rows and constant lengths do not match")
}
}
if (min(keep_constant) > 0) {
stop("Error: Atleast one parameter must be free")
}
tform <- tolower(tform)
for (i in 1:length(tform)) {
if (tform[i] %in% c("plin", "plinear", "product-linear")) {
tform[i] <- "plin"
} else if (tform[i] %in% c("lin", "linear")) {
tform[i] <- "lin"
} else if (tform[i] %in% c("loglin", "loglinear", "log-linear")) {
tform[i] <- "loglin"
}
}
tform_order <- c(
"loglin", "lin", "plin", "loglin_slope", "loglin_top",
"lin_slope", "lin_int", "quad_slope",
"step_slope", "step_int",
"lin_quad_slope", "lin_quad_int", "lin_exp_slope",
"lin_exp_int", "lin_exp_exp_slope"
)
tform_iden <- match(tform, tform_order)
if (any(is.na(tform_iden))) {
stop(paste("Missing tform option ", tform[is.na(tform_iden)],
", ",
sep = ""
))
}
if (((typeof(a_n) == "list") && (length(a_n) == 1)) || (typeof(a_n) != "list")) {
if (typeof(a_n) == "list") {
a_n <- a_n[[1]]
}
if (length(a_n) < length(names)) {
if (verbose >= 2) {
warning(paste("Warning: Parameters used: ",
length(a_n), ", Covariates used: ",
length(names), ", Remaining filled with 0.01",
sep = ""
))
}
a_n <- c(a_n, rep(0.01, length(names) - length(a_n)))
} else if (length(a_n) > length(names)) {
stop(paste("Error: Parameters used: ", length(a_n),
", Covariates used: ", length(names),
sep = ""
))
}
df <- data.table::data.table(
"term_n" = term_n, "tform" = tform,
"keep_constant" = keep_constant,
"a_n" = a_n, "names" = names,
"iden_const" = rep(0, length(names)),
"para_num" = rep(0, length(names)),
"current_order" = seq_len(length(tform)),
"constraint_order" = col_to_cons
)
df$para_num[[model_control[["para_number"]] + 1]] <- 1
df$tform_order <- tform_iden
keycol <- c("term_n", "names", "tform_order")
data.table::setorderv(df, keycol)
a_n <- df$a_n
} else {
a_0 <- a_n[[1]]
for (a_i in a_n) {
if (length(a_i) != length(a_0)) {
stop(paste("Error: Parameters used in first option: ",
length(a_0),
", Parameters used in different option: ",
length(a_i),
", please fix parameter length",
sep = ""
))
}
}
if (length(a_0) < length(names)) {
if (verbose >= 2) {
warning(paste("Warning: Parameters used: ", length(a_0),
", Covariates used: ", length(names),
", Remaining filled with 0.01",
sep = ""
))
}
for (i in seq_len(length(a_n))) {
a_n[[i]] <- c(
a_n[[i]],
rep(0.01, length(names) - length(a_n[[i]]))
)
}
} else if (length(a_0) > length(names)) {
stop(paste("Error: Parameters used: ", length(a_0),
", Covariates used: ", length(names),
sep = ""
))
}
df <- data.table::data.table(
"term_n" = term_n, "tform" = tform,
"keep_constant" = keep_constant,
"names" = names, "iden_const" = rep(0, length(names)),
"para_num" = rep(0, length(names)),
"current_order" = seq_along(tform),
"constraint_order" = col_to_cons
)
df$para_num[[model_control[["para_number"]] + 1]] <- 1
for (i in seq_len(length(a_n))) {
df[[paste("a_", i, sep = "")]] <- a_n[[i]]
}
df$tform_order <- tform_iden
keycol <- c("term_n", "names", "tform_order")
data.table::setorderv(df, keycol)
for (i in seq_len(length(a_n))) {
a_n[[i]] <- df[[paste("a_", i, sep = "")]]
}
}
a <- df$tform
for (i in seq_len(length(a))) {
if (i < length(a)) {
if ((a[i] == "loglin_slope")) {
if (a[i + 1] != "loglin_top") {
stop("Error: loglin_top missing")
}
} else if ((a[i] == "lin_slope")) {
if (a[i + 1] != "lin_int") {
stop("Error: lin_int missing")
}
} else if ((a[i] == "step_slope")) {
if (a[i + 1] != "step_int") {
stop("Error: step_int missing")
}
} else if ((a[i] == "lin_quad_slope")) {
if (a[i + 1] != "lin_quad_int") {
stop("Error: lin_quad_int missing")
}
} else if ((a[i] == "lin_exp_slope")) {
if (a[i + 1] != "lin_exp_int") {
stop("Error: lin_exp_int missing")
}
} else if ((a[i] == "lin_exp_int")) {
if (a[i + 1] != "lin_exp_exp_slope") {
stop("Error: lin_exp_exp_slope missing")
}
}
}
if (i > 1) {
if ((a[i] == "lin_int")) {
if (a[i - 1] != "lin_slope") {
stop("Error: lin_slope missing")
}
} else if ((a[i] == "step_int")) {
if (a[i - 1] != "step_slope") {
stop("Error: step_slope missing")
}
} else if ((a[i] == "lin_quad_int")) {
if (a[i - 1] != "lin_quad_slope") {
stop("Error: lin_quad_slope missing")
}
} else if ((a[i] == "lin_exp_int")) {
if (a[i - 1] != "lin_exp_slope") {
stop("Error: lin_exp_slope missing")
}
} else if ((a[i] == "lin_exp_exp_slope")) {
if (a[i - 1] != "lin_exp_int") {
stop("Error: lin_exp_int missing")
}
}
}
}
col_to_cons <- df$constraint_order
cons_order <- c()
for (i in col_to_cons) {
if (i > 0) {
cons_order <- c(cons_order, i)
}
}
if (ncol(cons_mat) > 1) {
colnames(cons_mat) <- seq_len(ncol(cons_mat))
r0 <- nrow(cons_mat)
c0 <- ncol(cons_mat)
cons_mat <- as.matrix(cons_mat[, cons_order])
if ((nrow(cons_mat) == r0) && (ncol(cons_mat) == c0)) {
# all good
} else if ((nrow(cons_mat) == c0) && (ncol(cons_mat) == r0)) {
cons_mat <- t(cons_mat)
} else {
stop("Matrix reordering failed") # nocov
}
}
a_temp <- df$iden_const
b_temp <- df$para_num
para_num <- which(b_temp == 1) - 1
list(
"term_n" = df$term_n, "tform" = df$tform, "keep_constant" = df$keep_constant,
"a_n" = a_n, "names" = df$names,
"Permutation" = df$current_order,
"cons_mat" = unname(cons_mat), "cons_vec" = cons_vec,
"para_num" = para_num
)
}
#' Automatically assigns missing values in listed columns
#'
#' \code{Replace_Missing} checks each column and fills in NA values
#'
#' @inheritParams R_template
#' @family Data Cleaning Functions
#' @return returns a filled datatable
#' @export
#' @examples
#' library(data.table)
#' ## basic example code reproduced from the starting-description vignette
#' df <- data.table::data.table(
#' "UserID" = c(112, 114, 213, 214, 115, 116, 117),
#' "Starting_Age" = c(18, 20, 18, 19, 21, 20, 18),
#' "Ending_Age" = c(30, 45, NA, 47, 36, NA, 55),
#' "Cancer_Status" = c(0, 0, 1, 0, 1, 0, 0)
#' )
#' df <- Replace_Missing(df, c("Starting_Age", "Ending_Age"), 70)
Replace_Missing <- function(df, name_list, msv, verbose = FALSE) {
tryCatch(
{
df <- setDT(df)
},
error = function(e) {
df <- data.table(df)
}
)
verbose <- Check_Verbose(verbose)
if (is.na(msv)) {
stop("Error: The missing-value replacement is also NA")
}
for (j in name_list) {
#
if (j %in% names(df)) {
# fine
} else {
stop(paste("Error: ", j, " missing from column names", sep = ""))
}
if (sum(is.na(df[[j]]))) {
data.table::set(df, which(is.na(df[[j]])), j, msv)
if (verbose >= 3) {
message(paste("Note: Column ", j, " had replaced values",
sep = ""
)) # nocov
}
}
}
return(df)
}
#' Automatically assigns missing control values
#'
#' \code{Def_Control} checks and assigns default values
#'
#' @inheritParams R_template
#' @family Data Cleaning Functions
#' @return returns a filled list
#' @export
#' @examples
#' library(data.table)
#' control <- list(
#' "ncores" = 2, "lr" = 0.75, "maxiter" = 5,
#' "ties" = "breslow", "double_step" = 1
#' )
#' control <- Def_Control(control)
#'
Def_Control <- function(control) {
control_def <- list(
"verbose" = 0, "lr" = 0.75, "maxiter" = 20,
"halfmax" = 5, "epsilon" = 1e-4,
"deriv_epsilon" = 1e-4, "abs_max" = 1.0,
"change_all" = TRUE, "dose_abs_max" = 100.0,
"ties" = "breslow", "double_step" = 1,
"ncores" = as.numeric(detectCores())
)
names(control) <- tolower(names(control))
if ((identical(Sys.getenv("TESTTHAT"), "true")) || (identical(Sys.getenv("TESTTHAT_IS_CHECKING"), "true"))) {
control_def$ncores <- min(c(2, as.numeric(detectCores())))
}
sys_config <- System_Version()
OpenMP <- sys_config[["OpenMP Enabled"]]
os <- sys_config[["Operating System"]]
cpp_compiler <- sys_config[["Default c++"]]
R_compiler <- sys_config[["R Compiler"]]
if (!OpenMP) {
warning("Warning: OpenMP not detected, cores set to 1")
control$ncores <- 1 # nocov
}
if (Sys.getenv("R_COLOSSUS_NOT_CRAN") == "") {
if (os == "linux") {
if (cpp_compiler == "gcc") {
if (R_compiler != "gcc") { # nocov
control$ncores <- 1 # nocov
warning("Warning: linux machine not using gcc, cores set to 1. Set R_COLOSSUS_NOT_CRAN environemnt variable to skip check")
}
} else if (cpp_compiler == "clang") { # nocov
control$ncores <- 1 # nocov
warning("Warning: linux machine using clang, cores set to 1. Set R_COLOSSUS_NOT_CRAN environemnt variable to skip check")
}
}
}
for (nm in names(control_def)) {
if (nm %in% names(control)) {
if (nm == "ncores") {
if (control$ncores > control_def$ncores) {
stop(paste("Error: Cores Requested:", control["ncores"],
", Cores Available:", control_def["ncores"],
sep = " "
)) # nocov
}
} else if (nm == "verbose") {
control$verbose <- Check_Verbose(control$verbose)
}
} else {
control[nm] <- control_def[nm]
}
}
control["ties"] <- tolower(control["ties"])
control_min <- list(
"verbose" = 0, "lr" = 0.0, "maxiter" = -1,
"halfmax" = 0, "epsilon" = 0.0,
"deriv_epsilon" = 0.0, "abs_max" = 0.0,
"dose_abs_max" = 0.0
)
for (nm in names(control_min)) {
if (control[[nm]] < control_min[[nm]]) {
control[nm] <- control_min[nm]
}
}
control_int <- list(
"verbose" = 0, "maxiter" = -1,
"halfmax" = 0
)
for (nm in names(control_int)) {
control[nm] <- as.integer(control[nm])
}
return(control)
}
#' Automatically assigns geometric-mixture values and checks that a valid modelform is used
#'
#' \code{Def_modelform_fix} checks and assigns default values for modelform options
#'
#' @inheritParams R_template
#' @family Data Cleaning Functions
#' @return returns a filled list
#' @export
#' @examples
#' library(data.table)
#' control <- list(
#' "ncores" = 2, "lr" = 0.75, "maxiter" = 5,
#' "ties" = "breslow", "double_step" = 1
#' )
#' control <- Def_Control(control)
#' model_control <- list("single" = TRUE)
#' model_control <- Def_model_control(model_control)
#' term_n <- c(0, 1, 1)
#' modelform <- "a"
#' val <- Def_modelform_fix(control, model_control, modelform, term_n)
#' model_control <- val$model_control
#' modelform <- val$modelform
#'
Def_modelform_fix <- function(control, model_control, modelform, term_n) {
term_tot <- max(term_n) + 1
modelform <- toupper(modelform)
acceptable <- toupper(c(
"A", "PA", "PAE", "M", "ME", "GMIX", "GMIX-R", "GMIX-E",
"multiplicative", "multiplicative-excess", "additive",
"product-additive", "product-additive-excess"
))
if (modelform %in% acceptable) {
if (modelform %in% c("ME", "MULTIPLICATIVE", "MULTIPLICATIVE-EXCESS")) {
modelform <- "M"
} else if (modelform == "ADDITIVE") {
modelform <- "A"
} else if (modelform == "PRODUCT-ADDITIVE") {
modelform <- "PA"
} else if (modelform == "PRODUCT-ADDITIVE-EXCESS") {
modelform <- "PAE"
} else if (modelform == "GMIX-R") {
model_control$gmix_term <- rep(0, term_tot)
modelform <- "GMIX"
} else if (modelform == "GMIX-E") {
model_control$gmix_term <- rep(1, term_tot)
modelform <- "GMIX"
}
} else {
stop(paste("Error: Model formula ", modelform,
" not in acceptable list",
sep = ""
))
}
if (modelform == "GMIX") {
gmix_term <- model_control$gmix_term
if (length(gmix_term) != term_tot) {
stop(paste("Error: Terms used:", term_tot,
", Terms with gmix types available:",
length(gmix_term),
sep = " "
))
}
}
return(list("modelform" = modelform, "model_control" = model_control))
}
#' Automatically assigns missing model control values
#'
#' \code{Def_model_control} checks and assigns default values
#'
#' @inheritParams R_template
#' @family Data Cleaning Functions
#' @return returns a filled list
#' @export
#' @examples
#' library(data.table)
#' control <- list("single" = TRUE)
#' control <- Def_model_control(control)
#'
Def_model_control <- function(control) {
names(control) <- tolower(names(control))
control_def_names <- c(
"single", "basic", "null", "cr", "linear_err",
"gradient", "constraint", "strata", "surv",
"schoenfeld", "risk",
"risk_subset", "log_bound", "pearson", "deviance",
"mcml", "oberved_info", "time_risk"
)
for (nm in control_def_names) {
if (nm %in% names(control)) {
# fine
} else {
control[nm] <- FALSE
}
}
if (control["null"] == TRUE) {
control["single"] <- TRUE
}
if ("step_size" %in% names(control)) {
# fine
} else {
control["step_size"] <- 0.5
}
if ("unique_values" %in% names(control)) {
# fine
} else {
control["unique_values"] <- 2
}
if ("gmix_theta" %in% names(control)) {
# fine
} else {
control["gmix_theta"] <- 0.5
}
if ("gmix_term" %in% names(control)) {
# fine
} else {
control["gmix_term"] <- c(0)
}
if ("conditional_threshold" %in% names(control)) {
# fine
} else {
control["conditional_threshold"] <- 50
}
if (control[["log_bound"]]) {
if ("qchi" %in% names(control)) {
# fine
} else {
if ("alpha" %in% names(control)) {
control["qchi"] <- qchisq(1 - control[["alpha"]], df = 1) / 2
} else {
control["alpha"] <- 0.05
control["qchi"] <- qchisq(1 - control[["alpha"]], df = 1) / 2
}
}
if ("para_num" %in% names(control)) {
warning("R Warning: para_num detected in model_control, did you mean para_number?")
control["para_number"] <- control["para_num"]
}
if ("para_number" %in% names(control)) {
# fine
} else {
control["para_number"] <- 0
}
if ("maxstep" %in% names(control)) {
# fine
} else {
control["maxstep"] <- 10
}
if ("manual" %in% names(control)) {
# fine
} else {
control["manual"] <- FALSE
}
if ("search_mult" %in% names(control)) {
# fine
} else {
control["search_mult"] <- 1.0
}
if ("step_size" %in% names(control)) {
# fine
} else {
control["search_mult"] <- 0.5
}
}
if (control[["gradient"]]) {
control_def_names <- c(
"momentum", "adadelta", "adam"
)
for (nm in control_def_names) {
if (nm %in% names(control)) {
# fine
} else {
control[nm] <- FALSE
}
}
# add different parameters
if ("momentum_decay" %in% names(control)) {
# fine
} else {
control["momentum_decay"] <- 0.9
}
if ("learning_decay" %in% names(control)) {
# fine
} else {
control["learning_decay"] <- 0.999
}
if ("epsilon_decay" %in% names(control)) {
# fine
} else {
control["epsilon_decay"] <- 1e-4
}
}
return(control)
}
#' Automatically assigns missing guessing control values
#'
#' \code{Def_Control_Guess} checks and assigns default values
#'
#' @inheritParams R_template
#' @family Data Cleaning Functions
#' @return returns a filled list
#' @export
#' @examples
#' library(data.table)
#' guesses_control <- list(
#' "maxiter" = 10, "guesses" = 10,
#' "loglin_min" = -1, "loglin_max" = 1, "loglin_method" = "uniform"
#' )
#' a_n <- c(0.1, 2, 1.3)
#' guesses_control <- Def_Control_Guess(guesses_control, a_n)
#'
Def_Control_Guess <- function(guesses_control, a_n) {
names(guesses_control) <- tolower(names(guesses_control))
if ("verbose" %in% names(guesses_control)) { # determines extra printing
guesses_control$verbose <- Check_Verbose(guesses_control$verbose)
} else {
guesses_control$verbose <- 0
}
if ("maxiter" %in% names(guesses_control)) { # determines the iterations for each guess
# fine
} else {
guesses_control$maxiter <- 5
}
if ("guesses" %in% names(guesses_control)) { # determines the number of guesses
# fine
} else {
guesses_control$guesses <- 10
}
if ("exp_min" %in% names(guesses_control)) { # minimum exponential parameter change
# fine
} else {
guesses_control$exp_min <- -1
}
if ("exp_max" %in% names(guesses_control)) { # maximum exponential parameter change
# fine
} else {
guesses_control$exp_max <- 1
}
if ("intercept_min" %in% names(guesses_control)) { # minimum intercept parameter change
# fine
} else {
guesses_control$intercept_min <- -1
}
if ("intercept_max" %in% names(guesses_control)) { # maximum intercept parameter change
# fine
} else {
guesses_control$intercept_max <- 1
}
if ("lin_min" %in% names(guesses_control)) { # minimum linear parameter change
# fine
} else {
guesses_control$lin_min <- 0.1
}
if ("lin_max" %in% names(guesses_control)) { # maximum linear parameter change
# fine
} else {
guesses_control$lin_max <- 1
}
if ("exp_slope_min" %in% names(guesses_control)) { # minimum exp_slope parameter change
# fine
} else {
guesses_control$exp_slope_min <- 0.001
}
if ("exp_slope_max" %in% names(guesses_control)) { # maximum exp_slope parameter change
# fine
} else {
guesses_control$exp_slope_max <- 2
}
if ("term_initial" %in% names(guesses_control)) { # list of term numbers for tiered guessing
# fine
} else {
guesses_control$term_initial <- c(0)
}
if ("guess_constant" %in% names(guesses_control)) {
# binary values for if a parameter is distributed (0) or not (1)
if (length(guesses_control$guess_constant) < length(a_n)) {
guesses_control$guess_constant <- c(
guesses_control$guess_constant,
rep(0, length(a_n) - length(guesses_control$guess_constant))
)
}
} else {
guesses_control$guess_constant <- rep(0, length(a_n))
}
if ("guesses_start" %in% names(guesses_control)) {
# number of guesses for first part of tiered guessing
# fine
} else {
guesses_control$guesses_start <- guesses_control$guesses
}
if ("strata" %in% names(guesses_control)) { # if stratification is used
# fine
} else {
guesses_control$strata <- FALSE
}
if (("rmin" %in% names(guesses_control)) && ("rmax" %in% names(guesses_control))) {
# if rmin/rmax used is used
# fine
} else {
# if they are unequal length, the default is used
guesses_control$rmin <- c(-1)
guesses_control$rmax <- c(-1, -1)
}
return(guesses_control)
}
#' Calculates Full Parameter list for Special Dose Formula
#'
#' \code{Linked_Dose_Formula} Calculates all parameters for linear-quadratic and linear-exponential linked formulas
#'
#' @inheritParams R_template
#'
#' @return returns list of full parameters
#' @export
#' @examples
#' library(data.table)
#' tforms <- list("cov_0" = "quad", "cov_1" = "exp")
#' paras <- list("cov_0" = c(1, 3.45), "cov_1" = c(1.2, 4.5, 0.1))
#' full_paras <- Linked_Dose_Formula(tforms, paras)
#'
Linked_Dose_Formula <- function(tforms, paras, verbose = 0) {
verbose <- Check_Verbose(verbose)
full_paras <- list()
for (nm in names(tforms)) {
if (tforms[nm] == "quad") {
plist <- unlist(paras[nm], use.names = FALSE)
a0 <- plist[1]
y <- plist[2]
if (a0 < 0) {
stop("Error: a0 arguement was negative")
}
if (is.numeric(a0)) {
# fine
} else {
stop("Error: a0 arguement was not a number")
}
if (is.numeric(y)) {
# fine
} else {
stop("Error: threshold arguement was not a number") # nocov
}
a1 <- a0 / 2 / y
b1 <- a0 * y / 2
full_paras[[nm]] <- c(y, a0, a1, b1)
} else if (tforms[nm] == "exp") {
plist <- unlist(paras[nm], use.names = FALSE)
a0 <- plist[1]
y <- plist[2]
b1 <- plist[3]
if (a0 < 0) {
stop("Error: a0 arguement was negative")
}
if (is.numeric(a0)) {
# fine
} else {
stop("Error: a0 arguement was not a number") # nocov
}
if (is.numeric(y)) {
# fine
} else {
stop("Error: threshold arguement was not a number") # nocov
}
if (is.numeric(b1)) {
# fine
} else {
stop("Error: exponential arguement was not a number") # nocov
}
c1 <- log(a0) - log(b1) + b1 * y
a1 <- a0 * y + exp(c1 - b1 * y)
full_paras[[nm]] <- c(y, a0, a1, b1, c1)
}
}
return(full_paras)
}
#' Calculates The Additional Parameter For a linear-exponential formula with known maximum
#'
#' \code{Linked_Lin_Exp_Para} Calculates what the additional parameter would be for a desired maximum
#'
#' @inheritParams R_template
#'
#' @return returns parameter used by Colossus
#' @export
#' @examples
#' library(data.table)
#' y <- 7.6
#' a0 <- 1.2
#' a1_goal <- 15
#' full_paras <- Linked_Lin_Exp_Para(y, a0, a1_goal)
#'
Linked_Lin_Exp_Para <- function(y, a0, a1_goal, verbose = 0) {
verbose <- Check_Verbose(verbose)
b1 <- 10
lr <- 1.0
if (a0 < 0) {
stop("Error: a0 arguement was negative")
}
if (a1_goal > y * a0) {
# fine
} else {
stop("Error: goal is too low")
}
iter_i <- 0
while (iter_i < 100) {
iter_i <- iter_i + 1
c1 <- log(a0 / b1) + b1 * y
a1 <- a0 * y + exp(c1 - b1 * y)
a_dif <- a1 - a1_goal
if (abs(a_dif) < 1e-3) {
break
}
da1 <- -1 / b1 * exp(c1 - b1 * y)
db1 <- (a1_goal - a1) / da1
if (-1 * db1 > b1) {
db1 <- -0.9 * b1
}
b1 <- b1 + lr * db1
}
return(b1)
}
#' Splits a parameter into factors
#'
#' \code{factorize} uses user provided list of columns to define new parameter for each unique value and update the data.table.
#' Not for interaction terms
#'
#' @inheritParams R_template
#' @family Data Cleaning Functions
#' @return returns a list with two named fields. df for the updated dataframe, and cols for the new column names
#' @export
#' @examples
#' library(data.table)
#' a <- c(0, 1, 2, 3, 4, 5, 6)
#' b <- c(1, 2, 3, 4, 5, 6, 7)
#' c <- c(0, 1, 2, 1, 0, 1, 0)
#' df <- data.table::data.table("a" = a, "b" = b, "c" = c)
#' col_list <- c("c")
#' val <- factorize(df, col_list)
#' df <- val$df
#' new_col <- val$cols
#'
factorize <- function(df, col_list, verbose = 0) {
tryCatch(
{
df <- setDT(df)
},
error = function(e) {
df <- data.table(df)
}
)
verbose <- Check_Verbose(verbose)
cols <- c()
col0 <- names(df)
tnum <- c()
for (i in seq_len(length(col_list))) {
col <- col_list[i]
x <- sort(unlist(as.list(unique(df[, col, with = FALSE])),
use.names = FALSE
))
for (j in x) {
newcol <- c(paste(col, j, sep = "_"))
df[, newcol] <- 1 * (df[, col, with = FALSE] == j)
cols <- c(cols, newcol)
tnum <- c(tnum, i)
}
}
cols <- setdiff(names(df), col0)
if (length(col_list) > 1) {
cols <- Check_Dupe_Columns(df, cols, rep(0, length(cols)), verbose, TRUE)
}
if (verbose >= 3) {
message(paste("Note: Number of factors:", length(cols), sep = "")) # nocov
}
list("df" = df, "cols" = cols)
}
#' Splits a parameter into factors in parallel
#'
#' \code{factorize_par} uses user provided list of columns to define new parameter for each unique value and update the data.table.
#' Not for interaction terms
#'
#' @inheritParams R_template
#' @family Data Cleaning Functions
#' @return returns a list with two named fields. df for the updated dataframe, and cols for the new column names
#' @export
#' @examples
#' library(data.table)
#' a <- c(0, 1, 2, 3, 4, 5, 6)
#' b <- c(1, 2, 3, 4, 5, 6, 7)
#' c <- c(0, 1, 2, 1, 0, 1, 0)
#' df <- data.table::data.table("a" = a, "b" = b, "c" = c)
#' col_list <- c("c")
#' val <- factorize_par(df, col_list, FALSE, 2)
#' df <- val$df
#' new_col <- val$cols
#'
factorize_par <- function(df, col_list, verbose = 0, nthreads = as.numeric(detectCores())) {
tryCatch(
{
df <- setDT(df)
},
error = function(e) {
df <- data.table(df)
}
)
verbose <- Check_Verbose(verbose)
cols <- c()
vals <- c()
names <- c()
if ((identical(Sys.getenv("TESTTHAT"), "true")) || (identical(Sys.getenv("TESTTHAT_IS_CHECKING"), "true"))) {
nthreads <- 2
}
for (i in seq_len(length(col_list))) {
col <- col_list[i]
x <- sort(unlist(as.list(unique(df[, col, with = FALSE])),
use.names = FALSE
))
for (j in x) {
newcol <- c(paste(col, j, sep = "_"))
names <- c(names, newcol)
vals <- c(vals, j)
cols <- c(cols, i - 1)
}
}
df0 <- Gen_Fac_Par(
as.matrix(df[, col_list, with = FALSE]), vals,
cols, nthreads
)
df0 <- data.table::as.data.table(df0)
names(df0) <- names
col_keep <- Check_Dupe_Columns(
df0, names, rep(0, length(cols)),
verbose, TRUE
)
list("df" = cbind(df, df0), "cols" = col_keep)
}
#' Defines Interactions
#'
#' \code{interact_them} uses user provided interactions define interaction terms and update the data.table. assumes interaction is "+" or "*" and applies basic anti-aliasing to avoid duplicates
#'
#' @inheritParams R_template
#' @family Data Cleaning Functions
#' @return returns a list with two named fields. df for the updated dataframe, and cols for the new column names
#' @export
#' @examples
#' library(data.table)
#' a <- c(0, 1, 2, 3, 4, 5, 6)
#' b <- c(1, 2, 3, 4, 5, 6, 7)
#' c <- c(0, 1, 2, 1, 0, 1, 0)
#' df <- data.table::data.table("a" = a, "b" = b, "c" = c)
#' interactions <- c("a?+?b", "a?*?c")
#' new_names <- c("ab", "ac")
#' vals <- interact_them(df, interactions, new_names)
#' df <- vals$df
#' new_col <- vals$cols
interact_them <- function(df, interactions, new_names, verbose = 0) {
tryCatch(
{
df <- setDT(df)
},
error = function(e) {
df <- data.table(df)
}
)
verbose <- Check_Verbose(verbose)
cols <- c()
for (i in seq_len(length(interactions))) {
interac <- interactions[i]
formula <- unlist(strsplit(interac, "[?]"), use.names = FALSE)
if (length(formula) != 3) {
stop(paste(
"Error: Iteration:", interac, "has incorrect length of",
length(formula), "but should be 3."
))
}
newcol <- paste(formula[1], formula[2], formula[3], sep = "")
if (new_names[i] != "") {
newcol <- new_names[i]
}
col1 <- formula[1]
col2 <- formula[3]
if (paste(formula[1], "?", formula[2], "?", formula[3], sep = "") %in% interactions[i + seq_len(length(interactions))]) {
if (verbose >= 2) {
warning(paste("Warning: interation ", i, "is duplicated")) # nocov
}
} else if (paste(formula[3], "?", formula[2], "?", formula[1], sep = "") %in% interactions[i + seq_len(length(interactions))]) {
if (verbose >= 2) {
warning(paste(
"Warning: the reverse of interation ", i,
"is duplicated"
)) # nocov
}
} else {
if (formula[2] == "+") {
df[, newcol] <- df[, col1, with = FALSE] +
df[, col2, with = FALSE]
cols <- c(cols, newcol)
} else if (formula[2] == "*") {
df[, newcol] <- df[, col1, with = FALSE] *
df[, col2, with = FALSE]
cols <- c(cols, newcol)
} else {
stop(paste("Error: Incorrect operation of", formula[2]))
}
}
}
list("df" = df, "cols" = cols)
}
#' Defines the likelihood ratio test
#'
#' \code{Likelihood_Ratio_Test} uses two models and calculates the ratio
#'
#' @inheritParams R_template
#'
#' @return returns the score statistic
#' @export
#' @examples
#' library(data.table)
#' # In an actual example, one would run two seperate RunCoxRegression regressions,
#' # assigning the results to e0 and e1
#' e0 <- list("name" = "First Model", "LogLik" = -120)
#' e1 <- list("name" = "New Model", "LogLik" = -100)
#' score <- Likelihood_Ratio_Test(e1, e0)
#'
Likelihood_Ratio_Test <- function(alternative_model, null_model) {
if (("LogLik" %in% names(alternative_model)) && ("LogLik" %in% names(null_model))) {
return(2 * (unlist(alternative_model["LogLik"], use.names = FALSE) - unlist(null_model["LogLik"], use.names = FALSE)))
}
stop("Error: models input did not contain LogLik values")
return(NULL) # nocov
}
#' checks for duplicated column names
#'
#' \code{Check_Dupe_Columns} checks for duplicated columns, columns with the same values, and columns with single value. Currently not updated for multi-terms
#'
#' @inheritParams R_template
#' @family Data Cleaning Functions
#' @return returns the usable columns
#' @export
#' @examples
#' library(data.table)
#' a <- c(0, 1, 2, 3, 4, 5, 6)
#' b <- c(1, 2, 3, 4, 5, 6, 7)
#' c <- c(0, 1, 2, 1, 0, 1, 0)
#' df <- data.table::data.table("a" = a, "b" = b, "c" = c)
#' cols <- c("a", "b", "c")
#' term_n <- c(0, 0, 1)
#' unique_cols <- Check_Dupe_Columns(df, cols, term_n)
#'
Check_Dupe_Columns <- function(df, cols, term_n, verbose = 0, factor_check = FALSE) {
tryCatch(
{
df <- setDT(df)
},
error = function(e) {
df <- data.table(df)
}
)
verbose <- Check_Verbose(verbose)
if (length(cols) > 1) {
features_pair <- combn(cols, 2, simplify = FALSE) # list all column pairs
terms_pair <- combn(term_n, 2, simplify = FALSE) # list all term pairs
toRemove <- c() # init a vector to store duplicates
for (pair_n in seq_len(length(features_pair))) {
# put the pairs for testing into temp objects
pair <- unlist(features_pair[pair_n])
term <- unlist(terms_pair[pair_n])
f1 <- pair[1]
f2 <- pair[2]
if (!(f1 %in% names(df))) {
stop(paste("Error: ", f1, " not in data.table", sep = "")) # nocov
}
if (!(f2 %in% names(df))) {
stop(paste("Error: ", f2, " not in data.table", sep = "")) # nocov
}
t1 <- term[1]
t2 <- term[2]
checked_factor <- TRUE
if (factor_check) { #
if ((is.numeric(df[[f1]])) && (is.numeric(df[[f2]]))) {
if (sum(df[[f1]] * df[[f2]]) == 0) {
checked_factor <- FALSE
}
} else if (is.numeric(df[[f1]]) != is.numeric(df[[f2]])) {
checked_factor <- FALSE # nocov
}
}
if ((t1 == t2) && (checked_factor)) {
if (!(f1 %in% toRemove) && !(f2 %in% toRemove)) {
if (all(df[[f1]] == df[[f2]])) { # test for duplicates
if (verbose >= 2) {
warning(paste("Warning: ", f1, " and ", f2,
" are equal",
sep = ""
))
}
toRemove <- c(toRemove, f2) # build the list of duplicates
}
if (min(df[[f2]]) == max(df[[f2]])) {
if (min(df[[f2]]) == 0) {
toRemove <- c(toRemove, f2) # remove zero values
}
}
}
}
}
newcol <- setdiff(cols, toRemove)
if (length(newcol) == 1) {
if (min(df[, newcol, with = FALSE]) == max(df[, newcol, with = FALSE])) {
return(c()) # nocov
} else {
return(newcol)
}
}
return(newcol)
} else if (length(cols) == 1) {
f1 <- cols[1]
if (!(f1 %in% names(df))) {
stop(paste("Error: ", f1, " not in data.table", sep = ""))
}
if (min(df[, cols, with = FALSE]) == max(df[, cols, with = FALSE])) {
if (min(df[, cols, with = FALSE]) == 0) {
return(c())
} else {
return(cols)
}
} else {
return(cols)
}
} else {
return(c())
}
return(c()) # nocov
}
#' Applies time duration truncation limits to create columns for Cox model
#'
#' \code{Check_Trunc} creates columns to use for truncation
#'
#' @inheritParams R_template
#' @family Data Cleaning Functions
#' @return returns the updated data and time period columns
#' @export
#' @examples
#' library(data.table)
#' df <- data.table::data.table(
#' "UserID" = c(112, 114, 213, 214, 115, 116, 117),
#' "Starting_Age" = c(18, 20, 18, 19, 21, 20, 18),
#' "Ending_Age" = c(30, 45, 57, 47, 36, 60, 55),
#' "Cancer_Status" = c(0, 0, 1, 0, 1, 0, 0)
#' )
#' # For the interval case
#' time1 <- "Starting_Age"
#' time2 <- "Ending_Age"
#' ce <- c("%trunc%", "Ending_Age")
#' val <- Check_Trunc(df, ce)
#' df <- val$df
#' ce <- val$ce
#'
Check_Trunc <- function(df, ce, verbose = 0) {
tryCatch(
{
df <- setDT(df)
},
error = function(e) {
df <- data.table(df)
}
)
verbose <- Check_Verbose(verbose)
if (ce[1] == "%trunc%") {
if (ce[2] == "%trunc%") {
stop("Error: Both endpoints are truncated, not acceptable")
}
tname <- ce[2]
tmin <- min(df[, get(tname)]) - 1
if (!("right_trunc" %in% names(df))) {
df[, ":="(right_trunc = tmin)]
}
ce[1] <- "right_trunc"
} else if (ce[2] == "%trunc%") {
tname <- ce[1]
tmax <- max(df[, get(tname)]) + 1
if (!("left_trunc" %in% names(df))) {
df[, ":="(left_trunc = tmax)]
}
ce[2] <- "left_trunc"
}
return(list("df" = df, "ce" = ce))
}
#' Applies time dependence to parameters
#'
#' \code{gen_time_dep} generates a new dataframe with time dependent covariates by applying a grid in time
#'
#' @inheritParams R_template
#' @family Data Cleaning Functions
#' @return returns the updated dataframe
#' @export
#' @examples
#' library(data.table)
#' # Adapted from the tests
#' a <- c(20, 20, 5, 10, 15)
#' b <- c(1, 2, 1, 1, 2)
#' c <- c(0, 0, 1, 1, 1)
#' df <- data.table::data.table("a" = a, "b" = b, "c" = c)
#' time1 <- "%trunc%"
#' time2 <- "a"
#' event <- "c"
#' control <- list(
#' "lr" = 0.75, "maxiter" = -1, "halfmax" = 5, "epsilon" = 1e-9,
#' "deriv_epsilon" = 1e-9, "abs_max" = 1.0,
#' "dose_abs_max" = 100.0,
#' "verbose" = FALSE, "ties" = "breslow", "double_step" = 1
#' )
#' grt_f <- function(df, time_col) {
#' return((df[, "b"] * df[, get(time_col)])[[1]])
#' }
#' func_form <- c("lin")
#' df_new <- gen_time_dep(
#' df, time1, time2, event, TRUE, 0.01, c("grt"), c(),
#' c(grt_f), paste("test", "_new.csv", sep = ""), func_form, 2
#' )
#' file.remove("test_new.csv")
#'
gen_time_dep <- function(df, time1, time2, event0, iscox, dt, new_names, dep_cols, func_form, fname, tform, nthreads = as.numeric(detectCores())) {
tryCatch(
{
df <- setDT(df)
},
error = function(e) {
df <- data.table(df)
}
)
dfn <- names(df)
ce <- c(time1, time2, event0)
t_check <- Check_Trunc(df, ce)
df <- t_check$df
ce <- t_check$ce
time1 <- ce[1]
time2 <- ce[2]
dfn_same <- dfn[!(dfn %in% dep_cols)]
dfn_dep <- c()
for (i in seq_len(length(new_names))) {
name0 <- paste(new_names[i], 0, sep = "_")
name1 <- paste(new_names[i], 1, sep = "_")
func <- func_form[i]
df[, name0] <- lapply(func, function(f) f(df, time1))
df[, name1] <- lapply(func, function(f) f(df, time2))
dfn_dep <- c(dfn_dep, name0, name1)
}
if (length(new_names) != length(func_form)) {
stop(paste("Error: new_names vector should be the same size as the list of functions applied",
sep = ""
))
}
if (length(new_names) != length(tform)) {
stop(paste("Error: new_names vector should be the same size as the list of interpolation method used",
sep = ""
))
}
for (i in seq_len(length(tform))) {
temp <- tform[i]
if (temp != "lin") {
a <- substr(temp, 1, 5)
if (a != "step?") {
stop(paste("Error: Interpolation method not recognized: ",
temp,
sep = ""
))
}
}
}
dfn_time <- c(time1, time2)
dfn_event <- c(event0)
dfn_same <- dfn_same[!(dfn_same %in% dfn_time)]
dfn_same <- dfn_same[!(dfn_same %in% dfn_event)]
dfend <- df[get(event0) == 1, ]
tu <- sort(unlist(unique(dfend[, time2, with = FALSE]), use.names = FALSE))
if (iscox) {
df <- df[get(time2) >= min(tu), ]
df <- df[get(time1) <= max(tu), ]
}
x_time <- as.matrix(df[, dfn_time, with = FALSE])
x_dep <- as.matrix(df[, dfn_dep, with = FALSE])
x_same <- as.matrix(df[, dfn_same, with = FALSE])
x_event <- as.matrix(df[, dfn_event, with = FALSE])
if (grepl(".csv", fname, fixed = TRUE)) {
# fine
} else {
fname <- paste(fname, ".csv", sep = "_")
}
if ((identical(Sys.getenv("TESTTHAT"), "true")) || (identical(Sys.getenv("TESTTHAT_IS_CHECKING"), "true"))) {
nthreads <- 2
}
Write_Time_Dep(
x_time, x_dep, x_same, x_event, dt, fname,
tform, tu, iscox, nthreads
)
df_new <- data.table::fread(fname,
data.table = TRUE,
header = FALSE, nThread = nthreads,
col.names = c(time1, time2, new_names, dfn_same, event0)
)
data.table::setkeyv(df_new, c(event0, time2, time1))
return(df_new)
}
#' Automates creating a date difference column
#'
#' \code{Date_Shift} generates a new dataframe with a column containing time difference in a given unit
#'
#' @inheritParams R_template
#' @param dcol0 list of starting month, day, and year
#' @param dcol1 list of ending month, day, and year
#' @family Data Cleaning Functions
#' @return returns the updated dataframe
#' @export
#' @examples
#' library(data.table)
#' m0 <- c(1, 1, 2, 2)
#' m1 <- c(2, 2, 3, 3)
#' d0 <- c(1, 2, 3, 4)
#' d1 <- c(6, 7, 8, 9)
#' y0 <- c(1990, 1991, 1997, 1998)
#' y1 <- c(2001, 2003, 2005, 2006)
#' df <- data.table::data.table("m0" = m0, "m1" = m1, "d0" = d0, "d1" = d1, "y0" = y0, "y1" = y1)
#' df <- Date_Shift(df, c("m0", "d0", "y0"), c("m1", "d1", "y1"), "date_since")
#'
Date_Shift <- function(df, dcol0, dcol1, col_name, units = "days") {
tryCatch(
{
df <- setDT(df)
},
error = function(e) {
df <- data.table(df)
}
)
def_cols <- names(df)
df$dt0 <- paste(df[[match(dcol0[1], names(df))]],
df[[match(dcol0[2], names(df))]],
df[[match(dcol0[3], names(df))]],
sep = "-"
)
df$dt1 <- paste(df[[match(dcol1[1], names(df))]],
df[[match(dcol1[2], names(df))]],
df[[match(dcol1[3], names(df))]],
sep = "-"
)
# TO NOT ENCOUNTER DAYLIGHT SAVINGS ISSUES, THE UTC TIMEZONE IS USED
# IF NOT USED THEN RESULTS MAY HAVE TIMES OFF BY 1/24 decimals
df[, col_name] <- difftime(
strptime(df$dt1,
format = "%m-%d-%Y",
tz = "UTC"
),
strptime(df$dt0, format = "%m-%d-%Y"),
units = units,
tz = "UTC"
)
def_cols <- c(def_cols, col_name)
return(df[, def_cols, with = FALSE])
}
#' Automates creating a date since a reference column
#'
#' \code{Time_Since} generates a new dataframe with a column containing time since a reference in a given unit
#'
#' @inheritParams R_template
#' @param dcol0 list of ending month, day, and year
#' @family Data Cleaning Functions
#' @return returns the updated dataframe
#' @export
#' @examples
#' library(data.table)
#' m0 <- c(1, 1, 2, 2)
#' m1 <- c(2, 2, 3, 3)
#' d0 <- c(1, 2, 3, 4)
#' d1 <- c(6, 7, 8, 9)
#' y0 <- c(1990, 1991, 1997, 1998)
#' y1 <- c(2001, 2003, 2005, 2006)
#' df <- data.table::data.table(
#' "m0" = m0, "m1" = m1,
#' "d0" = d0, "d1" = d1,
#' "y0" = y0, "y1" = y1
#' )
#' tref <- strptime("3-22-1997", format = "%m-%d-%Y", tz = "UTC")
#' df <- Time_Since(df, c("m1", "d1", "y1"), tref, "date_since")
#'
Time_Since <- function(df, dcol0, tref, col_name, units = "days") {
tryCatch(
{
df <- setDT(df)
},
error = function(e) {
df <- data.table(df)
}
)
def_cols <- names(df)
df$dt0 <- paste(df[[match(dcol0[1], names(df))]], df[[match(
dcol0[2],
names(df)
)]], df[[match(dcol0[3], names(df))]], sep = "-")
df[, col_name] <- lapply(df$dt0, function(x) {
(difftime(
strptime(x,
format = "%m-%d-%Y", tz = "UTC"
), tref,
units = units
))
})
def_cols <- c(def_cols, col_name)
return(df[, def_cols, with = FALSE])
}
#' Automates creating data for a joint competing risks analysis
#'
#' \code{Joint_Multiple_Events} generates input for a regression with multiple non-independent events and models
#'
#' @inheritParams R_template
#' @param events vector of event column names
#' @param term_n_list list of vectors for term numbers for event specific or shared model elements, defaults to term 0
#' @param tform_list list of vectors for subterm types for event specific or shared model elements, defaults to loglinear
#' @param keep_constant_list list of vectors for constant elements for event specific or shared model elements, defaults to free (0)
#' @param a_n_list list of vectors for parameter values for event specific or shared model elements, defaults to term 0
#' @param name_list list of vectors for columns for event specific or shared model elements, required
#' @family Data Cleaning Functions
#' @return returns the updated dataframe and model inputs
#' @export
#' @examples
#' library(data.table)
#' a <- c(0, 0, 0, 1, 1, 1)
#' b <- c(1, 1, 1, 2, 2, 2)
#' c <- c(0, 1, 2, 2, 1, 0)
#' d <- c(1, 1, 0, 0, 1, 1)
#' e <- c(0, 1, 1, 1, 0, 0)
#' df <- data.table("t0" = a, "t1" = b, "e0" = c, "e1" = d, "fac" = e)
#' time1 <- "t0"
#' time2 <- "t1"
#' df$pyr <- df$t1 - df$t0
#' pyr <- "pyr"
#' events <- c("e0", "e1")
#' names_e0 <- c("fac")
#' names_e1 <- c("fac")
#' names_shared <- c("t0", "t0")
#' term_n_e0 <- c(0)
#' term_n_e1 <- c(0)
#' term_n_shared <- c(0, 0)
#' tform_e0 <- c("loglin")
#' tform_e1 <- c("loglin")
#' tform_shared <- c("quad_slope", "loglin_top")
#' keep_constant_e0 <- c(0)
#' keep_constant_e1 <- c(0)
#' keep_constant_shared <- c(0, 0)
#' a_n_e0 <- c(-0.1)
#' a_n_e1 <- c(0.1)
#' a_n_shared <- c(0.001, -0.02)
#' name_list <- list("shared" = names_shared, "e0" = names_e0, "e1" = names_e1)
#' term_n_list <- list("shared" = term_n_shared, "e0" = term_n_e0, "e1" = term_n_e1)
#' tform_list <- list("shared" = tform_shared, "e0" = tform_e0, "e1" = tform_e1)
#' keep_constant_list <- list(
#' "shared" = keep_constant_shared,
#' "e0" = keep_constant_e0, "e1" = keep_constant_e1
#' )
#' a_n_list <- list("shared" = a_n_shared, "e0" = a_n_e0, "e1" = a_n_e1)
#' val <- Joint_Multiple_Events(
#' df, events, name_list, term_n_list,
#' tform_list, keep_constant_list, a_n_list
#' )
#'
Joint_Multiple_Events <- function(df, events, name_list, term_n_list = list(), tform_list = list(), keep_constant_list = list(), a_n_list = list()) {
tryCatch(
{
df <- setDT(df)
},
error = function(e) {
df <- data.table(df)
}
)
# filling missing values
for (i in names(name_list)) {
temp0 <- unlist(name_list[i], use.names = FALSE)
if (i %in% names(term_n_list)) {
temp1 <- unlist(term_n_list[i], use.names = FALSE)
if (length(temp0) != length(temp1)) {
stop(paste("Error: item ", i, " in name_list has ",
length(temp0),
" items, but same item in term_n_list has ",
length(temp1),
" items. Omit entry in term_n_list to set to default of term 0 or add missing values",
sep = ""
))
}
} else {
temp <- list(rep(0, length(temp0)))
names(temp) <- i
term_n_list <- c(term_n_list, temp)
}
if (i %in% names(tform_list)) {
temp1 <- unlist(tform_list[i], use.names = FALSE)
if (length(temp0) != length(temp1)) {
stop(paste("Error: item ", i, " in name_list has ",
length(temp0),
" items, but same item in tform_list has ",
length(temp1),
" items. Omit entry in tform_list to set to default of 'loglin' or add missing values",
sep = ""
))
}
} else {
temp <- list(rep("loglin", length(temp0)))
names(temp) <- i
tform_list <- c(tform_list, temp)
}
if (i %in% names(keep_constant_list)) {
temp1 <- unlist(keep_constant_list[i], use.names = FALSE)
if (length(temp0) != length(temp1)) {
stop(paste("Error: item ", i, " in name_list has ",
length(temp0),
" items, but same item in keep_constant_list has ",
length(temp1),
" items. Omit entry in tform_list to set to default of 0 or add missing values",
sep = ""
))
}
} else {
temp <- list(rep(0, length(temp0)))
names(temp) <- i
keep_constant_list <- c(keep_constant_list, temp)
}
if (i %in% names(a_n_list)) {
temp1 <- unlist(a_n_list[i], use.names = FALSE)
if (length(temp0) != length(temp1)) {
stop(paste("Error: item ", i, " in name_list has ",
length(temp0),
" items, but same item in a_n_list has ",
length(temp1),
" items. Omit entry in a_n_list to set to default of 0 or add missing values",
sep = ""
))
}
} else {
temp <- list(rep(0, length(temp0)))
names(temp) <- i
a_n_list <- c(a_n_list, temp)
}
}
df0 <- data.table()
for (i in names(df)) {
if (i %in% events) {
if (i == events[1]) {
temp <- c()
for (j in events) {
temp <- c(temp, unlist(df[, j, with = FALSE], use.names = FALSE))
}
df0[, "events"] <- temp
}
temp <- c()
for (j in events) {
if (i == j) {
temp <- c(temp, rep(1, nrow(df)))
} else {
temp <- c(temp, rep(0, nrow(df)))
}
}
df0[, i] <- temp
} else {
temp <- rep(unlist(df[, i, with = FALSE], use.names = FALSE), length(events))
df0[, i] <- temp
}
}
names <- c()
term_n <- c()
tform <- c()
keep_constant <- c()
a_n <- c()
if ("shared" %in% names(name_list)) {
names <- c(names, unlist(name_list["shared"], use.names = FALSE))
term_n <- c(term_n, unlist(term_n_list["shared"], use.names = FALSE))
tform <- c(tform, unlist(tform_list["shared"], use.names = FALSE))
keep_constant <- c(keep_constant, unlist(keep_constant_list["shared"],
use.names = FALSE
))
a_n <- c(a_n, unlist(a_n_list["shared"], use.names = FALSE))
}
for (i in events) {
if (i %in% names(name_list)) {
interactions <- c()
new_names <- c()
for (j in unlist(name_list[i], use.names = FALSE)) {
interactions <- c(interactions, paste(j, "?*?", i, sep = ""))
new_names <- c(new_names, paste(j, "_", i, sep = ""))
}
vals <- interact_them(df0, interactions, new_names)
df0 <- vals$df
new_names <- vals$cols
names <- c(names, new_names)
term_n <- c(term_n, unlist(term_n_list[i], use.names = FALSE))
tform <- c(tform, unlist(tform_list[i], use.names = FALSE))
keep_constant <- c(keep_constant, unlist(keep_constant_list[i],
use.names = FALSE
))
a_n <- c(a_n, unlist(a_n_list[i], use.names = FALSE))
}
}
return(list("df" = df0, "names" = names, "term_n" = term_n, "tform" = tform, "keep_constant" = keep_constant, "a_n" = a_n))
}
#' Checks system OS
#'
#' \code{get_os} checks the system OS, part of configuration script
#'
#' @return returns a string representation of OS
get_os <- function() {
sysinf <- Sys.info()
if (!is.null(sysinf)) {
os <- sysinf["sysname"]
if (os == "Darwin") {
os <- "osx" # nocov
}
} else { ## mystery machine
os <- .Platform$OS.type # nocov
if (grepl("^darwin", R.version$os)) { # nocov
os <- "osx" # nocov
}
if (grepl("linux-gnu", R.version$os)) { # nocov
os <- "linux" # nocov
}
}
tolower(os)
}
#' Checks default c++ compiler
#'
#' \code{gcc_version} Checks default c++ compiler, part of configuration script
#'
#' @return returns a string representation of gcc, clang, or c++ output
gcc_version <- function() {
out <- tryCatch(run("c++", "-v", stderr_to_stdout = TRUE),
error = function(cnd) list(stdout = "")
)
out0 <- str_match(out$stdout, "gcc version")[1]
if (!is.na(out0)) {
out <- "gcc"
} else {
out0 <- str_match(out$stdout, "clang version")[1] # nocov
if (!is.na(out0)) { # nocov
out <- "clang" # nocov
} else {
out <- out$stdout # nocov
}
}
out
}
#' Checks how R was compiled
#'
#' \code{Rcomp_version} Checks how R was compiled, part of configuration script
#'
#' @return returns a string representation of gcc, clang, or R CMD config CC output
Rcomp_version <- function() {
out <- rcmd("config", "CC")
out0 <- str_match(out$stdout, "clang")[1]
if (!is.na(out0)) {
out <- "clang" # nocov
} else {
out0 <- str_match(out$stdout, "gcc")[1]
if (!is.na(out0)) {
out <- "gcc"
} else {
out <- out$stdout # nocov
}
}
out
}
#' Checks default R c++ compiler
#'
#' \code{Rcpp_version} checks ~/.R/Makevars script for default compilers set, part of configuration script
#'
#' @return returns a string representation of gcc, clang, or head ~/.R/Makevars
Rcpp_version <- function() {
out <- tryCatch(run("head", "~/.R/Makevars", stderr_to_stdout = TRUE),
error = function(cnd) list(stdout = "")
)
out0 <- str_match(out$stdout, "clang")[1]
if (!is.na(out0)) {
out <- "clang" # nocov
} else {
out0 <- str_match(out$stdout, "gcc")[1]
if (!is.na(out0)) {
out <- "gcc" # nocov
} else {
out <- out$stdout # nocov
}
}
out
}
#' Checks OS, compilers, and OMP
#'
#' \code{System_Version} checks OS, default R c++ compiler, and if OMP is enabled
#'
#' @return returns a list of results
#' @export
#' @family Output and Information Functions
System_Version <- function() {
os <- get_os()
gcc <- gcc_version()
Rcomp <- Rcomp_version()
OMP <- OMP_Check()
list(
"Operating System" = os, "Default c++" = gcc, "R Compiler" = Rcomp,
"OpenMP Enabled" = OMP
)
}
#' General purpose verbosity check
#'
#' \code{Check_Verbose} checks and assigns verbosity values
#'
#' @inheritParams R_template
#' @family Data Cleaning Functions
#' @return returns correct verbose value
#'
Check_Verbose <- function(verbose) {
if (verbose %in% c(0, 1, 2, 3, 4)) {
verbose <- as.integer(verbose)
} else if (verbose %in% c(TRUE, FALSE)) {
if (verbose) {
verbose <- 3 # nocov
} else {
verbose <- 0 # nocov
}
} else {
stop("Error: verbosity arguement not valid")
}
verbose
}
#' uses a table, list of categories, and list of event summaries to generate person-count tables
#'
#' \code{Event_Count_Gen} generates event-count tables
#'
#' @inheritParams R_template
#' @param table dataframe with every category/event column needed
#' @param categ list with category columns and methods, methods can be either strings or lists of boundaries
#' @param events list of columns to summarize, supports counts and means and renaming the summary column
#'
#' @return returns a grouped table and a list of category boundaries used
#' @family Data Cleaning Functions
#' @export
#' @examples
#' library(data.table)
#' a <- c(0, 1, 2, 3, 4, 5, 6)
#' b <- c(1, 2, 3, 4, 5, 6, 7)
#' c <- c(0, 1, 0, 0, 0, 1, 0)
#' table <- data.table::data.table(
#' "a" = a,
#' "b" = b,
#' "c" = c
#' )
#' categ <- list(
#' "a" = "0/3/5]7",
#' "b" = list(
#' lower = c(-1, 3, 6),
#' upper = c(3, 6, 10),
#' name = c("low", "medium", "high")
#' )
#' )
#' event <- list(
#' "c" = "count AS cases",
#' "a" = "mean", "b" = "mean"
#' )
#' e <- Event_Count_Gen(table, categ, event, T)
#'
Event_Count_Gen <- function(table, categ, events, verbose = FALSE) {
df <- as_tibble(table)
`%>%` <- dplyr::`%>%`
#
categ_cols <- c()
categ_bounds <- list()
for (cat in names(categ)) {
cat_str <- ""
if (!cat %in% names(table)) {
stop(paste(cat, " not in table", sep = ""))
}
if (length(categ[[cat]]) > 1) { # list of bounds
temp0 <- categ[[cat]]$lower
temp1 <- categ[[cat]]$upper
if ("name" %in% names(categ[[cat]])) { # assign names to the levels
temp2 <- categ[[cat]]$name
} else { # number the categories
temp2 <- seq_len(length(temp0)) # 1:length(temp0)
}
num_categ <- length(temp0) # number of categories
cat_col <- paste(cat, "category", sep = "_") # name of the category
categ_cols <- c(categ_cols, cat_col) # add to list
df <- df %>% mutate("{cat_col}" := "Unassigned") # add to tibble
for (i in 1:num_categ) { # for each category
L <- as.numeric(temp0[i]) # lower bound
if (grepl("]", temp1[i], fixed = TRUE)) {
U <- as.numeric(gsub("]", "", temp1[i])) # assign upper
a_col_categ <- case_when(df[[cat]] <= U & df[[cat]] >= L & df[[cat_col]] == "Unassigned" ~ as.character(temp2[i]), .default = df[[cat_col]]) # rows within the bounds and unassigned are assigned the name
cat_str <- paste(cat_str, paste("[", L, ", ", U, "]", sep = ""), sep = " ") # add to list of intervals
} else {
U <- as.numeric(temp1[i]) # assign upper
if (L == U) { # discrete case
a_col_categ <- case_when(df[[cat]] == U & df[[cat_col]] == "Unassigned" ~ as.character(temp2[i]), .default = df[[cat_col]])
cat_str <- paste(cat_str, paste("[", L, ", ", U, "]", sep = ""), sep = " ")
} else { # interval case
a <- case_when(df[[cat]] < U & df[[cat]] >= L & df[[cat_col]] == "Unassigned" ~ as.character(temp2[i]), .default = df[[cat_col]])
cat_str <- paste(cat_str, paste("[", L, ", ", U, ")", sep = ""), sep = " ")
}
}
df[[cat_col]] <- a_col_categ # update column
}
categ_bounds[[cat_col]] <- cat_str # add to list of intervals
} else { # string of bounds
cat_str <- ""
temp <- categ[[cat]]
temp <- gsub("/", " / ", temp)
temp <- gsub("]", " ] ", temp)
temp <- strsplit(temp, "\\s+")[[1]] # adding and splitting by spaces
num_categ <- (length(temp) - 1) / 2 # get number of categories
cat_col <- paste(cat, "category", sep = "_")
categ_cols <- c(categ_cols, cat_col)
df <- df %>% mutate("{cat_col}" := "Unassigned") # add to tibble
for (i in 1:num_categ) {
L <- as.numeric(temp[2 * i - 1]) # get upper and lower bounds
U <- as.numeric(temp[2 * i + 1])
if (L == U) { # dicrete value case
a_col_categ <- case_when(df[[cat]] == U & df[[cat_col]] == "Unassigned" ~ as.character(i), .default = df[[cat_col]])
cat_str <- paste(cat_str, paste("[", L, ", ", U, "]", sep = ""), sep = " ")
} else if (temp[2 * i] == "/") { # strict less than upper bound
a_col_categ <- case_when(df[[cat]] < U & df[[cat]] >= L & df[[cat_col]] == "Unassigned" ~ as.character(i), .default = df[[cat_col]])
cat_str <- paste(cat_str, paste("[", L, ", ", U, ")", sep = ""), sep = " ")
} else { # within and including bounds
a_col_categ <- case_when(df[[cat]] <= U & df[[cat]] >= L & df[[cat_col]] == "Unassigned" ~ as.character(i), .default = df[[cat_col]])
cat_str <- paste(cat_str, paste("[", L, ", ", U, "]", sep = ""), sep = " ")
}
df[[cat_col]] <- a_col_categ # update column
}
categ_bounds[[cat_col]] <- cat_str # add interval boundaries
}
}
df_group <- df %>%
group_by(across(all_of(categ_cols))) %>%
summarize("COUNT" = n(), .groups = "drop") # group by columns and summarize by counts
for (evt in names(events)) { # iterate through events
if (grepl(" AS ", events[[evt]], fixed = TRUE)) { # get method and updated name
temp <- gsub(" AS ", " ", events[[evt]])
temp <- strsplit(temp, "\\s+")[[1]]
col_name <- temp[2]
method <- temp[1]
} else {
col_name <- evt
temp <- strsplit(events[[evt]], "\\s+")[[1]]
method <- temp[[length(temp)]]
}
if (method == "count") { # summarize by count
df_temp <- df %>%
group_by(across(all_of(categ_cols))) %>%
summarize("{col_name}" := sum(.data[[evt]]), .groups = "drop")
} else if (method == "mean") { # summarize by mean
df_temp <- df %>%
group_by(across(all_of(categ_cols))) %>%
summarize("{col_name}" := mean(.data[[evt]]), .groups = "drop")
}
df_group[[col_name]] <- df_temp[[col_name]] # add to grouped tibble
}
# return the grouped list and list of catgegory bounds
list(df = as.data.table(df_group), bounds = categ_bounds)
}
#' uses a table, list of categories, list of summaries, list of events, and person-year information to generate person-time tables
#'
#' \code{Event_Time_Gen} generates event-time tables
#'
#' @inheritParams R_template
#' @param table dataframe with every category/event column needed
#' @param pyr list with entry and exit lists, containing day/month/year columns in the table
#' @param categ list with category columns and methods, methods can be either strings or lists of boundaries, includes a time category or entry/exit are both required for the pyr list
#' @param summaries list of columns to summarize, supports counts, means, and weighted means by person-year and renaming the summary column
#' @param events list of events or interests, checks if events are within each time interval
#'
#' @return returns a grouped table and a list of category boundaries used
#' @family Data Cleaning Functions
#' @export
#' @examples
#' library(data.table)
#' a <- c(0, 1, 2, 3, 4, 5, 6)
#' b <- c(1, 2, 3, 4, 5, 6, 7)
#' c <- c(0, 1, 0, 0, 0, 1, 0)
#' d <- c(1, 2, 3, 4, 5, 6, 7)
#' e <- c(2, 3, 4, 5, 6, 7, 8)
#' f <- c(
#' 1900, 1900, 1900, 1900,
#' 1900, 1900, 1900
#' )
#' g <- c(1, 2, 3, 4, 5, 6, 7)
#' h <- c(2, 3, 4, 5, 6, 7, 8)
#' i <- c(
#' 1901, 1902, 1903, 1904,
#' 1905, 1906, 1907
#' )
#' table <- data.table::data.table(
#' "a" = a, "b" = b, "c" = c,
#' "d" = d, "e" = e, "f" = f,
#' "g" = g, "h" = h, "i" = i
#' )
#' categ <- list(
#' "a" = "-1/3/5]7",
#' "b" = list(
#' lower = c(-1, 3, 6), upper = c(3, 6, 10),
#' name = c("low", "medium", "high")
#' ),
#' "time AS time" = list(
#' "day" = c(1, 1, 1, 1, 1),
#' "month" = c(1, 1, 1, 1, 1),
#' "year" = c(1899, 1903, 1910)
#' )
#' )
#' summary <- list(
#' "c" = "count AS cases",
#' "a" = "mean",
#' "b" = "weighted_mean"
#' )
#' events <- list("c")
#' pyr <- list(
#' entry = list(year = "f", month = "e", day = "d"),
#' exit = list(year = "i", month = "h", day = "g"),
#' unit = "years"
#' )
#' e <- Event_Time_Gen(table, pyr, categ, summary, events, T)
#'
Event_Time_Gen <- function(table, pyr, categ, summaries, events, verbose = FALSE) {
df <- as_tibble(table)
`%within%` <- lubridate::`%within%`
`%>%` <- dplyr::`%>%`
#
categ_cols <- c()
categ_bounds <- list()
for (cat in names(categ)) { # for each category
cat_str <- ""
if (grepl(" AS ", cat, fixed = TRUE)) { # get the column and name
temp <- strsplit(gsub(" AS ", " ", cat), "\\s+")[[1]]
cat_col <- temp[2]
cat_df <- temp[1]
} else {
cat_df <- cat
cat_col <- paste(cat, "category", sep = "_")
}
if (cat_col %in% names(df)) { # check that the category doesn't already exist in the original dataframe
stop(paste(cat_col, " already exists, use ' AS ' to rename if needed", sep = ""))
}
if (length(categ[[cat]]) > 1) { # boundary as lists
if ("lower" %in% names(categ[[cat]])) { # lower and upper boundary intervals
if (!cat_df %in% names(table)) {
stop(paste(cat_df, " not in table", sep = ""))
}
temp0 <- categ[[cat]]$lower
temp1 <- categ[[cat]]$upper
if ("name" %in% names(categ[[cat]])) { # check for names for each level
temp2 <- categ[[cat]]$name
} else {
temp2 <- seq_len(length(temp0)) # 1:length(temp0)
}
num_categ <- length(temp0)
categ_cols <- c(categ_cols, cat_col)
df <- df %>% mutate("{cat_col}" := "Unassigned") # initialize the tibble
for (i in 1:num_categ) { # for each level
L <- as.numeric(temp0[i])
if (grepl("]", temp1[i], fixed = TRUE)) { # check for including the upper limit
U <- as.numeric(gsub("]", "", temp1[i])) # get upper limit
a_col_categ <- case_when(df[[cat_df]] <= U & df[[cat_df]] >= L & df[[cat_col]] == "Unassigned" ~ as.character(temp2[i]), .default = df[[cat_col]]) # assign the level to unassigned rows
cat_str <- paste(cat_str, paste("[", L, ", ", U, "]", sep = ""), sep = " ") # add boundary information to list of intervals
} else {
U <- as.numeric(temp1[i]) # get upper limit
if (L == U) { # discrete case
a_col_categ <- case_when(df[[cat_df]] == U & df[[cat_col]] == "Unassigned" ~ as.character(temp2[i]), .default = df[[cat_col]])
cat_str <- paste(cat_str, paste("[", L, ", ", U, "]", sep = ""), sep = " ")
} else { # interval case
a_col_categ <- case_when(df[[cat_df]] < U & df[[cat_df]] >= L & df[[cat_col]] == "Unassigned" ~ as.character(temp2[i]), .default = df[[cat_col]])
cat_str <- paste(cat_str, paste("[", L, ", ", U, ")", sep = ""), sep = " ")
}
}
df[[cat_col]] <- a_col_categ # update tibble
}
} else { # calender time scale
if ("day" %in% names(categ[[cat]])) { # determine the day, month, year data for the category
day <- categ[[cat]]$day # nocov
if ("month" %in% names(categ[[cat]])) { # nocov
month <- categ[[cat]]$month # nocov
} else { # nocov
month <- rep(1, length(day)) # nocov
}
if ("year" %in% names(categ[[cat]])) { # nocov
year <- categ[[cat]]$year # nocov
} else {
year <- rep(1900, length(day)) # nocov
}
} else if ("month" %in% names(categ[[cat]])) { # nocov
month <- categ[[cat]]$month # nocov
day <- rep(1, length(month)) # nocov
if ("year" %in% names(categ[[cat]])) { # nocov
year <- categ[[cat]]$year # nocov
} else { # nocov
year <- rep(1900, length(month)) # nocov
}
} else if ("year" %in% names(categ[[cat]])) { # nocov
year <- categ[[cat]]$year # nocov
day <- rep(1, length(year)) # nocov
month <- rep(1, length(month)) # nocov
} else {
stop("calender category missing 'day', 'month', and 'year'") # all three are required
}
num_categ <- length(year) - 1
categ_cols <- c(categ_cols, cat_col)
df <- df %>% mutate("{cat_col}" := "Unassigned") # initialize the tibble
if ("PYR" %in% names(df)) {
stop("either multiple time categorizations used or 'PYR' column already exists")
}
df <- df %>% mutate("PYR" := 0)
interval <- "unassigned"
if ("exit" %in% names(pyr)) { # format the exit as date column
pyr_exit <- pyr$exit
if ("year" %in% names(pyr_exit)) { # nocov
if ("month" %in% names(pyr_exit)) { # nocov
if ("day" %in% names(pyr_exit)) { # nocov
exit <- make_date(year = table[[pyr_exit$year]], month = table[[pyr_exit$month]], day = table[[pyr_exit$day]]) # nocov
} else { # nocov
exit <- make_date(year = table[[pyr_exit$year]], month = table[[pyr_exit$month]]) # nocov
}
} else { # nocov
if ("day" %in% names(pyr_exit)) { # nocov
exit <- make_date(year = table[[pyr_exit$year]], day = table[[pyr_exit$day]]) # nocov
} else { # nocov
exit <- make_date(year = table[[pyr_exit$year]]) # nocov
}
}
} else {
if ("month" %in% names(pyr_exit)) { # nocov
if ("day" %in% names(pyr_exit)) { # nocov
exit <- make_date(month = table[[pyr_exit$month]], day = table[[pyr_exit$day]]) # nocov
} else { # nocov
exit <- make_date(month = table[[pyr_exit$month]]) # nocov
}
} else { # nocov
if ("day" %in% names(pyr_exit)) { # nocov
exit <- make_date(day = table[[pyr_exit$day]]) # nocov
} else {
stop("person-year exit missing day, month, and year")
}
}
}
if ("entry" %in% names(pyr)) { # format the entry as date column
pyr_entry <- pyr$entry
interval <- "interval"
if ("year" %in% names(pyr_entry)) { # nocov
if ("month" %in% names(pyr_entry)) { # nocov
if ("day" %in% names(pyr_entry)) { # nocov
entry <- make_date(year = table[[pyr_entry$year]], month = table[[pyr_entry$month]], day = table[[pyr_entry$day]]) # nocov
} else { # nocov
entry <- make_date(year = table[[pyr_entry$year]], month = table[[pyr_entry$month]]) # nocov
}
} else { # nocov
if ("day" %in% names(pyr_entry)) { # nocov
entry <- make_date(year = table[[pyr_entry$year]], day = table[[pyr_entry$day]]) # nocov
} else { # nocov
entry <- make_date(year = table[[pyr_entry$year]]) # nocov
}
}
} else { # nocov
if ("month" %in% names(pyr_entry)) { # nocov
if ("day" %in% names(pyr_entry)) { # nocov
entry <- make_date(month = table[[pyr_entry$month]], day = table[[pyr_entry$day]]) # nocov
} else { # nocov
entry <- make_date(month = table[[pyr_entry$month]]) # nocov
}
} else { # nocov
if ("day" %in% names(pyr_entry)) { # nocov
entry <- make_date(day = table[[pyr_entry$day]]) # nocov
} else {
stop("person-year entry missing day, month, and year")
}
}
}
} else {
pyr_entry <- list()
interval <- "left trunc" # if there is an exit and no entry then the data is left truncated
}
} else if ("entry" %in% names(pyr)) {
pyr_entry <- pyr$entry # format the entry as date column
if ("year" %in% names(pyr_entry)) { # nocov
if ("month" %in% names(pyr_entry)) { # nocov
if ("day" %in% names(pyr_entry)) { # nocov
entry <- make_date(year = table[[pyr_entry$year]], month = table[[pyr_entry$month]], day = table[[pyr_entry$day]]) # nocov
} else { # nocov
entry <- make_date(year = table[[pyr_entry$year]], month = table[[pyr_entry$month]]) # nocov
}
} else { # nocov
if ("day" %in% names(pyr_entry)) { # nocov
entry <- make_date(year = table[[pyr_entry$year]], day = table[[pyr_entry$day]]) # nocov
} else { # nocov
entry <- make_date(year = table[[pyr_entry$year]]) # nocov
}
}
} else { # nocov
if ("month" %in% names(pyr_entry)) { # nocov
if ("day" %in% names(pyr_entry)) { # nocov
entry <- make_date(month = table[[pyr_entry$month]], day = table[[pyr_entry$day]]) # nocov
} else { # nocov
entry <- make_date(month = table[[pyr_entry$month]]) # nocov
}
} else { # nocov
if ("day" %in% names(pyr_entry)) { # nocov
entry <- make_date(day = table[[pyr_entry$day]]) # nocov
} else {
stop("person-year entry missing day, month, and year")
}
}
}
pyr_exit <- list()
interval <- "right trunc" # entry and no exit is right truncated
} else {
stop("Date columns given for category, but person-year data not in date format") # pyr does not have exit or entry values
}
df_added <- tibble()
pyr_unit <- "years" # default person-years to years
if ("unit" %in% names(pyr)) {
pyr_unit <- pyr$unit
}
for (time_i in 1:num_categ) { # for every time interval
istart <- make_date(year = year[time_i], month = month[time_i], day = day[time_i]) # interval start
iend <- make_date(year = year[time_i + 1], month = month[time_i + 1], day = day[time_i + 1]) # interval end
cat_str <- paste(cat_str, paste("[", istart, " to ", iend, "]", sep = ""), sep = " ") # prepare the interval info
categ_interval <- interval(istart, iend) # define as date interval
c_categ <- list()
if (interval == "left trunc") {
# only exit
a_categ <- case_when(exit %within% categ_interval & df[[cat_col]] == "Unassigned" ~ as.character(time_i), .default = "0") # exit within interval, set to i
for (evt in events) {
c_categ[[evt]] <- case_when(exit %within% categ_interval & df[[cat_col]] == "Unassigned" ~ df[[evt]], .default = 0) # exit within interval set to event column value
}
b_categ <- case_when(a_categ == as.character(time_i) ~ as.numeric(as.duration(interval(istart, exit)), pyr_unit), .default = 0) # for every row with exit in interval, track the duration from interval start
risk_interval <- interval(iend, exit) # interval from interval end to row end
a_categ <- case_when(as.numeric(as.duration(risk_interval)) > 0 & df[[cat_col]] == "Unassigned" ~ as.character(time_i), .default = a_categ) # track rows which end after the interval, set to i
b_categ <- case_when(as.numeric(as.duration(risk_interval)) > 0 & df[[cat_col]] == "Unassigned" ~ as.numeric(as.duration(categ_interval), pyr_unit), .default = b_categ) # rows which end after interval are at risk the full interval
} else if (interval == "right trunc") {
# only entry
a_categ <- case_when(entry %within% categ_interval & df[[cat_col]] == "Unassigned" ~ as.character(time_i), .default = "0") # start during the interval, set to i
for (evt in events) {
c_categ[[evt]] <- case_when(entry %within% categ_interval & df[[cat_col]] == "Unassigned" ~ df[[evt]], .default = 0) # start during interval, set to event value
}
b_categ <- case_when(entry %within% categ_interval & df[[cat_col]] == "Unassigned" ~ as.numeric(as.duration(interval(entry, iend)), pyr_unit), .default = 0) # every row which starts during the interval, tracks duration from entry to interval end
risk_interval <- interval(entry, istart)
a_categ <- case_when(as.numeric(as.duration(risk_interval)) > 0 & df[[cat_col]] == "Unassigned" ~ as.character(time_i), .default = a_categ) # tracks which rows start before the interval
b_categ <- case_when(as.numeric(as.duration(risk_interval)) > 0 & df[[cat_col]] == "Unassigned" ~ as.numeric(as.duration(categ_interval), pyr_unit), .default = b_categ) # rows which start before the interval are at risk the full interval
} else {
# both entry and exit
risk_interval <- interval(entry, exit)
a_categ <- case_when(istart %within% risk_interval & iend %within% risk_interval & df[[cat_col]] == "Unassigned" ~ as.character(time_i), .default = "0") # interval fully contained, set to i
a_categ <- case_when(iend %within% risk_interval & df[[cat_col]] == "Unassigned" ~ as.character(time_i), .default = a_categ) # interval ends during row interval, set to i
a_categ <- case_when(istart %within% risk_interval & df[[cat_col]] == "Unassigned" ~ as.character(time_i), .default = a_categ) # interval starts during row interval, set to i
a_categ <- case_when(entry %within% categ_interval & exit %within% categ_interval & df[[cat_col]] == "Unassigned" ~ as.character(time_i), .default = a_categ) # row interval fully contained in category interval, set to i
for (evt in events) {
c_categ[[evt]] <- case_when(exit %within% categ_interval & df[[cat_col]] == "Unassigned" ~ df[[evt]], .default = 0) # row ends during category interval, set to event value
}
b_categ <- case_when(exit %within% categ_interval & df[[cat_col]] == "Unassigned" ~ as.numeric(as.duration(interval(istart, exit)), pyr_unit), .default = 0) # rows which end during the category interval, track category interval start to row end
b_categ <- case_when(entry %within% categ_interval & df[[cat_col]] == "Unassigned" ~ as.numeric(as.duration(interval(entry, iend)), pyr_unit), .default = b_categ) # rows which enter during the category interval, track entry to category interval end
b_categ <- case_when(istart %within% risk_interval & iend %within% risk_interval & df[[cat_col]] == "Unassigned" ~ as.numeric(as.duration(categ_interval), pyr_unit), .default = b_categ) # category interval fully in row interval, track full category interval
b_categ <- case_when(entry %within% categ_interval & exit %within% categ_interval & df[[cat_col]] == "Unassigned" ~ as.numeric(as.duration(risk_interval), pyr_unit), .default = b_categ) # row interval fully in category interval, track full row interval
}
index_kept <- seq_len(nrow(df))
index_kept <- index_kept[a_categ == time_i] # indexes which contain the category interval to some level
b_categ <- b_categ[a_categ == time_i] # durations for kept indexes
row_kept <- slice(df, index_kept) # dataframe at kept rows
row_kept <- row_kept %>% mutate("{cat_col}" := as.character(time_i)) # time category value
row_kept <- row_kept %>% mutate("PYR" := b_categ) # duration value
for (evt in events) {
d_categ <- c_categ[[evt]]
d_categ <- d_categ[a_categ == time_i]
row_kept <- row_kept %>% mutate("{evt}" := d_categ) # event values
}
df_added <- bind_rows(df_added, row_kept) # new updates dataset
}
df <- df_added
}
categ_bounds[[cat_col]] <- cat_str # update the list of category boundaries
} else { # boundary as string, not a time category
if (!cat_df %in% names(table)) {
stop(paste(cat_df, " not in table", sep = ""))
}
cat_str <- ""
temp <- categ[[cat]]
temp <- gsub("/", " / ", temp)
temp <- gsub("]", " ] ", temp)
temp <- strsplit(temp, "\\s+")[[1]] # seperate values and delimiters
num_categ <- (length(temp) - 1) / 2
categ_cols <- c(categ_cols, cat_col)
df <- df %>% mutate("{cat_col}" := "Unassigned") # initialize column
for (time_i in 1:num_categ) {
L <- as.numeric(temp[2 * time_i - 1])
U <- as.numeric(temp[2 * time_i + 1]) # get lower and upper bounds
if (L == U) { # discrete case
a_categ <- case_when(df[[cat_df]] == U & df[[cat_col]] == "Unassigned" ~ as.character(time_i), .default = df[[cat_col]])
cat_str <- paste(cat_str, paste("[", L, ", ", U, "]", sep = ""), sep = " ")
} else if (temp[2 * time_i] == "/") { # strictly below upper bound
a_categ <- case_when(df[[cat_df]] < U & df[[cat_df]] >= L & df[[cat_col]] == "Unassigned" ~ as.character(time_i), .default = df[[cat_col]])
cat_str <- paste(cat_str, paste("[", L, ", ", U, ")", sep = ""), sep = " ")
} else { # including both bounds
a_categ <- case_when(df[[cat_df]] <= U & df[[cat_df]] >= L & df[[cat_col]] == "Unassigned" ~ as.character(time_i), .default = df[[cat_col]])
cat_str <- paste(cat_str, paste("[", L, ", ", U, "]", sep = ""), sep = " ")
}
df[[cat_col]] <- a_categ # update tibble
}
categ_bounds[[cat_col]] <- cat_str
}
}
if ("PYR" %in% names(df)) {
# good, we need a person-years measure
} else {
if ("exit" %in% names(pyr)) { # format the exit as date column
pyr_exit <- pyr$exit
if ("year" %in% names(pyr_exit)) { # nocov
if ("month" %in% names(pyr_exit)) { # nocov
if ("day" %in% names(pyr_exit)) { # nocov
exit <- make_date(year = table[[pyr_exit$year]], month = table[[pyr_exit$month]], day = table[[pyr_exit$day]]) # nocov
} else { # nocov
exit <- make_date(year = table[[pyr_exit$year]], month = table[[pyr_exit$month]]) # nocov
}
} else { # nocov
if ("day" %in% names(pyr_exit)) { # nocov
exit <- make_date(year = table[[pyr_exit$year]], day = table[[pyr_exit$day]]) # nocov
} else { # nocov
exit <- make_date(year = table[[pyr_exit$year]]) # nocov
}
}
} else {
if ("month" %in% names(pyr_exit)) { # nocov
if ("day" %in% names(pyr_exit)) { # nocov
exit <- make_date(month = table[[pyr_exit$month]], day = table[[pyr_exit$day]]) # nocov
} else { # nocov
exit <- make_date(month = table[[pyr_exit$month]]) # nocov
}
} else { # nocov
if ("day" %in% names(pyr_exit)) { # nocov
exit <- make_date(day = table[[pyr_exit$day]]) # nocov
} else {
stop("person-year exit missing day, month, and year")
}
}
}
if ("entry" %in% names(pyr)) { # format the entry as date column
pyr_entry <- pyr$entry
interval <- "interval"
if ("year" %in% names(pyr_entry)) { # nocov
if ("month" %in% names(pyr_entry)) { # nocov
if ("day" %in% names(pyr_entry)) { # nocov
entry <- make_date(year = table[[pyr_entry$year]], month = table[[pyr_entry$month]], day = table[[pyr_entry$day]]) # nocov
} else { # nocov
entry <- make_date(year = table[[pyr_entry$year]], month = table[[pyr_entry$month]]) # nocov
}
} else { # nocov
if ("day" %in% names(pyr_entry)) { # nocov
entry <- make_date(year = table[[pyr_entry$year]], day = table[[pyr_entry$day]]) # nocov
} else { # nocov
entry <- make_date(year = table[[pyr_entry$year]]) # nocov
}
}
} else { # nocov
if ("month" %in% names(pyr_entry)) { # nocov
if ("day" %in% names(pyr_entry)) { # nocov
entry <- make_date(month = table[[pyr_entry$month]], day = table[[pyr_entry$day]]) # nocov
} else { # nocov
entry <- make_date(month = table[[pyr_entry$month]]) # nocov
}
} else { # nocov
if ("day" %in% names(pyr_entry)) { # nocov
entry <- make_date(day = table[[pyr_entry$day]]) # nocov
} else {
stop("person-year entry missing day, month, and year")
}
}
}
} else {
stop("Both entry and exit needed to calculate person-years, without a time category for reference")
}
risk_interval <- interval(entry, exit)
pyr_unit <- "years" # default person-years to years
if ("unit" %in% names(pyr)) {
pyr_unit <- pyr$unit
}
df <- df %>% mutate("PYR" := as.numeric(as.duration(risk_interval), pyr_unit))
} else {
stop("Both entry and exit needed to calculate person-years, without a time category for reference")
}
}
# check that required columns are not already in use
if ("PYR" %in% names(summaries)) { # storing person-year durations
stop("'PYR' listed as a event column, either remove or rename with ' AS '")
}
if ("AT_RISK" %in% names(summaries)) { # storing number at risk
stop("'AT_RISK' listed as a event column, either remove or rename with ' AS '")
}
df_group <- df %>%
group_by(across(all_of(categ_cols))) %>%
summarize("AT_RISK" = n(), "PYR" = sum(.data[["PYR"]]), .groups = "drop") # group by categories and define the durations and counts
for (evt in names(summaries)) { # for each event summary
if (grepl(" AS ", summaries[[evt]], fixed = TRUE)) { # get the method and column name
temp <- gsub(" AS ", " ", summaries[[evt]])
temp <- strsplit(temp, "\\s+")[[1]]
col_name <- temp[2]
method <- temp[1]
} else {
col_name <- evt
temp <- strsplit(summaries[[evt]], "\\s+")[[1]]
method <- temp[[length(temp)]]
}
if (method == "count") { # sum of event across each category combination
df_temp <- df %>%
group_by(across(all_of(categ_cols))) %>%
summarize("{col_name}" := sum(.data[[evt]]), .groups = "drop")
} else if (method == "mean") { # mean value of event across each category combination
df_temp <- df %>%
group_by(across(all_of(categ_cols))) %>%
summarize("{col_name}" := mean(.data[[evt]]), .groups = "drop")
} else if (method == "weighted_mean") { # mean value weighted by person-years
df_temp <- df %>%
group_by(across(all_of(categ_cols))) %>%
summarize("{col_name}" := weighted.mean(.data[[evt]], .data[["PYR"]]), .groups = "drop")
}
df_group[[col_name]] <- df_temp[[col_name]]
}
#
list(df = as.data.table(df_group), bounds = categ_bounds)
}
#' Converts a string equation to regression model inputs
#'
#' \code{Convert_Model_Eq} Converts a string expression of a risk model into the vectors used by different Colossus regression functions
#'
#' @inheritParams R_template
#'
#' @return returns a list of regression inputs
#' @export
#' @family Data Cleaning Functions
#' @examples
#' library(data.table)
#' a <- c(0, 1, 2, 3, 4, 5, 6)
#' b <- c(1, 2, 3, 4, 5, 6, 7)
#' c <- c(0, 1, 0, 0, 0, 1, 0)
#' d <- c(1, 2, 3, 4, 5, 6, 7)
#' e <- c(2, 3, 4, 5, 6, 7, 8)
#' table <- data.table::data.table(
#' "a" = a, "b" = b, "c" = c,
#' "d" = d, "e" = e
#' )
#' Model_Eq <- "cox(a,b, c) ~ loglinear(d, factor(e), 0) + multiplicative()"
#' e <- Convert_Model_Eq(Model_Eq, table)
#'
Convert_Model_Eq <- function(Model_Eq, df) {
tryCatch(
{
df <- setDT(df)
},
error = function(e) {
df <- data.table(df)
}
)
# values to assign to
term_n <- c()
tform <- c()
names <- c()
modelform <- "NONE"
surv_model_type <- "NONE"
tstart <- "NONE"
tend <- "NONE"
pyr <- "NONE"
event <- "NONE"
strata <- "NONE"
# split into the survival model type and the rest of the model terms
Model_Eq <- gsub(" ", "", Model_Eq)
first_split <- lapply(strsplit(Model_Eq, ""), function(x) which(x == "~"))[[1]]
if (length(first_split) == 0) {
stop("Error: The model equation did not contain ~")
}
surv_obj <- substr(Model_Eq, 1, first_split - 1)
model_obj <- substr(Model_Eq, first_split + 1, nchar(Model_Eq))
# split the survival model type
second_split <- lapply(strsplit(surv_obj, ""), function(x) which(x == "("))[[1]]
if (length(second_split) == 0) {
stop("Error: The left hand side did not contain (")
}
surv_type <- tolower(substr(surv_obj, 1, second_split - 1))
surv_paras <- substr(surv_obj, second_split + 1, nchar(surv_obj) - 1)
surv_paras <- strsplit(surv_paras, ",")[[1]]
# assign survival model values
surv_model_type <- surv_type
if (surv_type == "cox") {
if (length(surv_paras) == 2) {
# tend and event
tend <- surv_paras[1]
event <- surv_paras[2]
} else if (length(surv_paras) == 3) {
# tstart, tend, and event
tstart <- surv_paras[1]
tend <- surv_paras[2]
event <- surv_paras[3]
} else {
stop("Error: Invalid number of parameters for cox model")
}
} else if (surv_type == "cox_strata") {
if (length(surv_paras) == 3) {
# tend and event, then strata
tend <- surv_paras[1]
event <- surv_paras[2]
strata <- surv_paras[3]
} else if (length(surv_paras) == 4) {
# tstart, tend, and event, then strata
tstart <- surv_paras[1]
tend <- surv_paras[2]
event <- surv_paras[3]
strata <- surv_paras[4]
} else {
stop("Error: Invalid number of parameters for cox_strata model")
}
} else if (surv_type == "poisson") {
if (length(surv_paras) == 2) {
pyr <- surv_paras[1]
event <- surv_paras[2]
} else {
stop("Error: Invalid number of parameters for poisson model")
}
} else if (surv_type == "poisson_strata") {
if (length(surv_paras) > 2) {
pyr <- surv_paras[1]
event <- surv_paras[2]
strata <- surv_paras[3:length(surv_paras)]
} else {
stop("Error: Invalid number of parameters for poisson model with stratification")
}
} else {
stop("Error: Invalid survival model type for poisson")
}
#
right_model_terms <- strsplit(model_obj, "\\+")[[1]]
for (term_i in seq_along(right_model_terms)) {
# seperate the term type or model-formula from parameters
third_split <- lapply(strsplit(right_model_terms[term_i], ""), function(x) which(x == "("))[[1]]
if (length(third_split) == 0) {
stop(paste('Error: right hand side element "', right_model_terms[term_i], '" did not contain (', sep = ""))
}
model_type <- tolower(substr(right_model_terms[term_i], 1, third_split - 1))
model_type <- gsub("_", "-", model_type)
tform_acceptable <- c(
"plin", "lin", "loglin", "loglin-dose", "lin-dose",
"lin-quad-dose", "lin-exp-dose", "plinear", "product-linear", "linear",
"loglinear", "log-linear", "loglinear-dose", "log-linear-dose", "linear-dose", "linear-piecewise",
"quadratic", "quad", "quad-dose", "quadratic-dose",
"step-dose", "step-piecewise",
"linear-quadratic-dose", "linear-quadratic-piecewise",
"linear-exponential-dose", "linear-exponential-piecewise"
)
modelform_acceptable <- c(
"multiplicative", "multiplicative-excess", "additive", "product-additive",
"product-additive-excess", "a", "pa", "pae", "m", "me",
"gmix", "geometric-mixture", "gmix-r", "relative-geometric-mixture",
"gmix-e", "excess-geometric-mixture"
)
if (model_type %in% tform_acceptable) {
model_paras <- substr(right_model_terms[term_i], third_split + 1, nchar(right_model_terms[term_i]) - 1)
model_paras <- strsplit(model_paras, ",")[[1]]
last_entry <- model_paras[length(model_paras)]
if (is.na(suppressWarnings(as.integer(last_entry)))) {
# the last element isn't an integer
term_num <- 0
} else {
model_paras <- model_paras[1:(length(model_paras) - 1)]
term_num <- as.integer(last_entry)
}
for (subterm_i in seq_along(model_paras)) {
# add element
if (substr(model_paras[subterm_i], 1, 7) == "factor(") {
# baseline is set by using factor(column;baseline=level)
col_name <- substr(model_paras[subterm_i], 8, nchar(model_paras[subterm_i]) - 1)
base_split <- lapply(strsplit(col_name, ""), function(x) which(x == ";"))[[1]]
if (length(base_split) == 1) {
col_names <- substr(col_name, 1, base_split - 1)
base_line <- substr(col_name, base_split + 1 + 9, nchar(col_name))
base_level <- paste(col_names, base_line, sep = "_")
#
val <- factorize(df, col_names)
df <- val$df
col_name <- val$cols
if (base_level %in% names(df)) {
df <- df[, (base_level) := NULL]
col_name <- col_name[!col_name == base_level]
} else {
warning(paste("Warning: Baseline level ", base_level, " not found.", sep = ""))
}
} else {
val <- factorize(df, col_name)
df <- val$df
col_name <- val$cols
}
} else {
col_name <- c(model_paras[subterm_i])
}
# convert subterm formula
model_terms <- c(model_type)
if (model_type %in% c("plin", "plinear", "product-linear")) {
model_terms <- c("plin")
} else if (model_type %in% c("lin", "linear")) {
model_terms <- c("lin")
} else if (model_type %in% c("loglin", "loglinear", "log-linear")) {
model_terms <- c("loglin")
} else if (model_type %in% c("loglin-dose", "loglinear-dose", "log-linear-dose")) {
model_terms <- c("loglin_slope", "loglin_top")
} else if (model_type %in% c("lin-dose", "linear-dose", "linear-piecewise")) {
model_terms <- c("lin_slope", "lin_int")
} else if (model_type %in% c("quadratic", "quad", "quad-dose", "quadratic-dose")) {
model_terms <- c("quad_slope")
} else if (model_type %in% c("step-dose", "step-piecewise")) {
model_terms <- c("step_slope", "step_int")
} else if (model_type %in% c("lin-quad-dose", "linear-quadratic-dose", "linear-quadratic-piecewise")) {
model_terms <- c("lin_quad_slope", "lin_quad_int")
} else if (model_type %in% c("lin-exp-dose", "linear-exponential-dose", "linear-exponential-piecewise")) {
model_terms <- c("lin_exp_slope", "lin_exp_int", "lin_exp_exp_slope")
} else {
stop(paste("Error: Unknown subterm type used, ", model_type, sep = ""))
}
for (col in col_name) {
for (model_term in model_terms) {
names <- c(names, col)
tform <- c(tform, model_term)
term_n <- c(term_n, term_num)
}
}
}
} else if (model_type %in% modelform_acceptable) {
if (model_type %in% c("m", "me", "multiplicative", "multiplicative-excess")) {
model_type <- "M"
} else if (model_type %in% c("a", "additive")) {
model_type <- "A"
} else if (model_type %in% c("pa", "product-additive")) {
model_type <- "PA"
} else if (model_type %in% c("pae", "product-additive-excess")) {
model_type <- "PAE"
} else if (model_type %in% c("gmix", "geometric-mixture")) {
model_type <- "GMIX"
} else if (model_type %in% c("gmix-r", "relative-geometric-mixture")) {
model_type <- "GMIX-R"
} else if (model_type %in% c("gmix-e", "excess-geometric-mixture")) {
model_type <- "GMIX-E"
}
if (modelform == "NONE") {
modelform <- model_type
} else {
stop("Error: modelform defined twice")
}
} else {
stop(paste("Error: Unknown option encountered, ", model_type, sep = ""))
}
}
if (modelform == "NONE") {
modelform <- "M"
}
return(list(
"term_n" = term_n, "tform" = tform, "names" = names, "modelform" = modelform,
"survival_model_type" = surv_model_type,
"start_age" = tstart, "end_age" = tend, "person_year" = pyr, "event" = event, "strata" = strata,
"data" = df
))
}
#' Prints a regression output clearly
#'
#' \code{Interpret_Output} uses the list output from a regression, prints off a table of results and summarizes the score and convergence.
#'
#' @inheritParams R_template
#'
#' @return return nothing, prints the results to console
#' @export
#' @family Output and Information Functions
#' @examples
#' library(data.table)
#' ## basic example code reproduced from the starting-description vignette
#' df <- data.table::data.table(
#' "UserID" = c(112, 114, 213, 214, 115, 116, 117),
#' "Starting_Age" = c(18, 20, 18, 19, 21, 20, 18),
#' "Ending_Age" = c(30, 45, 57, 47, 36, 60, 55),
#' "Cancer_Status" = c(0, 0, 1, 0, 1, 0, 0),
#' "a" = c(0, 1, 1, 0, 1, 0, 1),
#' "b" = c(1, 1.1, 2.1, 2, 0.1, 1, 0.2),
#' "c" = c(10, 11, 10, 11, 12, 9, 11),
#' "d" = c(0, 0, 0, 1, 1, 1, 1),
#' "e" = c(0, 0, 1, 0, 0, 0, 1)
#' )
#' # For the interval case
#' time1 <- "Starting_Age"
#' time2 <- "Ending_Age"
#' event <- "Cancer_Status"
#' names <- c("a", "b", "c", "d")
#' a_n <- list(c(1.1, -0.1, 0.2, 0.5), c(1.6, -0.12, 0.3, 0.4))
#' # used to test at a specific point
#' term_n <- c(0, 1, 1, 2)
#' tform <- c("loglin", "lin", "lin", "plin")
#' modelform <- "M"
#' keep_constant <- c(0, 0, 0, 0)
#' control <- list(
#' "ncores" = 2, "lr" = 0.75, "maxiters" = c(5, 5, 5),
#' "halfmax" = 5, "epsilon" = 1e-3, "deriv_epsilon" = 1e-3,
#' "abs_max" = 1.0, "dose_abs_max" = 100.0,
#' "verbose" = FALSE,
#' "ties" = "breslow", "double_step" = 1, "guesses" = 2
#' )
#' e <- RunCoxRegression_Omnibus(df, time1, time2, event,
#' names, term_n, tform, keep_constant,
#' a_n, modelform, control,
#' model_control = list(
#' "single" = FALSE,
#' "basic" = FALSE, "cr" = FALSE, "null" = FALSE
#' )
#' )
#' Interpret_Output(e)
#'
Interpret_Output <- function(out_list, digits = 2) {
tryCatch(
{
df <- setDT(df)
},
error = function(e) {
df <- data.table(df)
}
)
# make sure the output isn't an error
passed <- out_list$Status
message("|-------------------------------------------------------------------|")
if (!is.na(passed)) {
if ("Likelihood_Goal" %in% names(out_list)) {
# likelihood boundary output
limits <- out_list$Parameter_Limits
neg <- out_list$Negative_Risk_Limit_Hit
conv <- out_list$Limit_Converged
lik_bound <- out_list$Likelihood_Boundary
lik_goal <- out_list$Likelihood_Goal
message("Likelihood Boundary Results")
if (neg[1]) {
message("Lower limit was not found")
} else {
if (conv[1]) {
message(paste("Lower limit converged to at ", round(limits[1], digits), " at a score of ", round(lik_bound[1], digits), " with of goal of ", round(lik_goal, digits), sep = ""))
} else {
message(paste("Lower limit reached ", round(limits[1], digits), " at a score of ", round(lik_bound[1], digits), " with of goal of ", round(lik_goal, digits), " but did not converge", sep = ""))
}
}
if (neg[2]) {
message("Upper limit was not found")
} else {
if (conv[2]) {
message(paste("Upper limit converged to at ", round(limits[2], digits), " at a score of ", round(lik_bound[2], digits), " with of goal of ", round(lik_goal, digits), sep = ""))
} else {
message(paste("Upper limit reached ", round(limits[2], digits), " at a score of ", round(lik_bound[2], digits), " with of goal of ", round(lik_goal, digits), " but did not converge", sep = ""))
}
}
} else {
# Check if its a multidose problem
if (out_list$Survival_Type == "Cox_Multidose") {
message("Currently the multiple realization code is not setup for printing results, due to the potentially large number of realizations")
} else if (out_list$Survival_Type == "CaseControl") {
# case control output
# get the model details
names <- out_list$Parameter_Lists$names
tforms <- out_list$Parameter_Lists$tforms
term_n <- out_list$Parameter_Lists$term_n
beta_0 <- out_list$beta_0
stdev <- out_list$Standard_Deviation
pval <- pnorm(-abs(beta_0 / stdev))
strata_odds <- out_list$StrataOdds
res_table <- data.table(
"Covariate" = names,
"Subterm" = tforms,
"Term Number" = term_n,
"Central Estimate" = beta_0,
"Standard Error" = stdev,
"1-tail p-value" = pval
)
message("Final Results")
print(res_table)
deviance <- out_list$Deviance
iteration <- out_list$Control_List$Iteration
step_max <- out_list$Control_List$`Maximum Step`
deriv_max <- out_list$Control_List$`Derivative Limiting`
converged <- out_list$Converged
#
freepara <- out_list$FreeParameters
freestrata <- out_list$FreeSets
#
message("\nMatched Case-Control Model Used")
message(paste("Deviance: ", round(deviance, digits), sep = ""))
message(paste(freestrata, " out of ", length(strata_odds), " matched sets used Unconditional Likelihood", sep = ""))
if (!is.null(converged)) {
message(paste("Iterations run: ", iteration, "\nmaximum step size: ", formatC(step_max, format = "e", digits = digits), ", maximum first derivative: ", formatC(deriv_max, format = "e", digits = digits), sep = ""))
if (converged) {
message("Analysis converged")
} else {
message("Analysis did not converge, check convergence criteria or run further")
}
}
} else {
# get the model details
names <- out_list$Parameter_Lists$names
tforms <- out_list$Parameter_Lists$tforms
term_n <- out_list$Parameter_Lists$term_n
beta_0 <- out_list$beta_0
stdev <- out_list$Standard_Deviation
pval <- 2 * pnorm(-abs(beta_0 / stdev))
res_table <- data.table(
"Covariate" = names,
"Subterm" = tforms,
"Term Number" = term_n,
"Central Estimate" = beta_0,
"Standard Error" = stdev,
"2-tail p-value" = pval
)
message("Final Results")
print(res_table)
# get the model results
LogLik <- out_list$LogLik
AIC <- out_list$AIC
BIC <- out_list$BIC
deviation <- out_list$Deviation
iteration <- out_list$Control_List$Iteration
step_max <- out_list$Control_List$`Maximum Step`
deriv_max <- out_list$Control_List$`Derivative Limiting`
converged <- out_list$Converged
if (is.null(deviation)) {
# cox model
message("\nCox Model Used")
message(paste("-2*Log-Likelihood: ", round(-2 * LogLik, digits), ", AIC: ", round(AIC, digits), sep = ""))
} else {
# poisson model
message("\nPoisson Model Used")
message(paste("-2*Log-Likelihood: ", round(-2 * LogLik, digits), ", Deviation: ", round(deviation, digits), ", AIC: ", round(AIC, digits), ", BIC: ", round(BIC, digits), sep = ""))
}
if (!is.null(converged)) {
message(paste("Iterations run: ", iteration, "\nmaximum step size: ", formatC(step_max, format = "e", digits = digits), ", maximum first derivative: ", formatC(deriv_max, format = "e", digits = digits), sep = ""))
if (converged) {
message("Analysis converged")
} else {
message("Analysis did not converge, check convergence criteria or run further")
}
}
}
}
} else {
message(paste("Regression Failed"))
}
if ("RunTime" %in% names(out_list)) {
run_time_sec <- as.numeric(out_list$RunTime, units = "secs")
run_time_min <- as.numeric(out_list$RunTime, units = "mins")
run_time_hour <- as.numeric(out_list$RunTime, units = "hours")
if (run_time_sec < 60) {
message(paste("Run finished in ", round(run_time_sec, digits), " seconds", sep = ""))
} else if (run_time_min < 60) {
message(paste("Run finished in ", round(run_time_min, digits), " minutes", sep = ""))
} else {
message(paste("Run finished in ", round(run_time_hour, digits), " hours", sep = ""))
}
# message(paste("Run finished in ", out_list$RunTime))
}
message("|-------------------------------------------------------------------|")
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.