# This file contains all the helper funcitons that the outer exported
# functions utilize. I try to have a pr at the start of the name for all
# the private functions.
#
# Author: max
###############################################################################
#' Looks for unique rowname match without grep
#'
#' Since a rowname may contain characters reserved by regular
#' expressions I've found it easier to deal with the rowname
#' finding by just checking for matching strings at the beginning
#' of the name while at the same time excluding names that have the
#' same stem, i.e. DM and DM_COMP will cause an issue since DM will
#' match both rows.
#'
#' @param rnames A vector with the rownames that are looked for
#' @param vn The variable name that is of interest
#' @param vars A vector with all the names and the potentially competing names
#' @return integer A vector containing the position of the matches
#'
#' TODO: remove this function in favor of the more powerful prMapVariable2Name
#' @keywords internal
prFindRownameMatches <- function(rnames, vn, vars) {
# Find the beginning of the string that matches exactly to the var. name
name_stub <- substr(rnames, 1, nchar(vn))
matches <- which(name_stub == vn)
# Since the beginning of the name may not be unique we need to
# check for other "competing matches"
# TODO: make this fix more elegant
vars_name_stub <- substr(vars, 1, nchar(vn))
if (sum(vars_name_stub == vn) > 1) {
competing_vars <- vars[vars != vn &
vars_name_stub == vn]
competing_matches <- NULL
for (comp_vn in competing_vars) {
competing_name_stub <- substr(rnames, 1, nchar(comp_vn))
competing_matches <-
c(
competing_matches,
which(competing_name_stub == comp_vn)
)
}
# Clean out competing matches
matches <- matches[!matches %in% competing_matches]
}
return(matches)
}
#' Get model outcome
#'
#' Uses the model to extract the outcome variable. Throws
#' error if unable to find the outcome.
#'
#' @param model The fitted model
#' @param mf The dataset that the model is fitted to - if missing it
#' uses the \code{\link[stats]{model.frame}()} dataset. This can cause
#' length issues as there may be variables that are excluded from the
#' model for different reasons.
#' @return vector
#' @importFrom stats as.formula
#'
#' @keywords internal
prExtractOutcomeFromModel <- function(model, mf) {
if (missing(mf)) {
mf <- model.frame(model)
outcome <- mf[, names(mf) == deparse(as.formula(model)[[2]])]
} else {
outcome <- eval(as.formula(model)[[2]], envir = mf)
}
if (is.null(outcome)) {
stop(
"Could not identify the outcome: ", deparse(as.formula(model)[[2]]),
" among the model.frame variables: '", paste(names(mf), collapse = "', '"), "'"
)
}
# Only use the status when used for survival::Surv objects
if (inherits(outcome, "Surv")) {
return(outcome[, "status"])
}
return(outcome)
}
#' Get model data.frame
#'
#' Returns the raw variables from the original data
#' frame using the \code{\link[stats:model.frame]{get_all_vars}()}
#' but with the twist that it also performs any associated
#' subsetting based on the model's \code{\link[base]{subset}()} argument.
#'
#' @param x The fitted model.
#' @param terms_only Only use the right side of the equation by selecting the terms
#' @param term.label Sometimes need to retrieve specific spline labels that are not among
#' the `labels(terms(x))`
#' @return data.frame
#' @importFrom stats get_all_vars
#'
#' @importFrom stats terms
#' @keywords internal
prGetModelData <- function(x, terms_only = FALSE, term.label) {
# Extract the variable names
true_vars <- all.vars(as.formula(x))
# Get the environment of the formula
env <- environment(as.formula(x))
data <- eval(x$call$data,
envir = env
)
# The data frame without the
mf <- get_all_vars(as.formula(x),
data = data
)
if (terms_only) {
cols2keep <- labels(terms(x))
if (!missing(term.label)) {
cols2keep <- c(cols2keep, term.label)
}
mf <- mf[, names(mf) %in% cols2keep, drop = FALSE]
}
if (!is.null(x$call$subset)) {
if (!is.null(data)) {
# As we don't know if the subsetting argument
# contained data from the data frame or the environment
# we need this additional check
mf <- tryCatch(mf[eval(x$call$subset,
envir = data,
enclos = env
), ],
error = function(e) {
stop("Could not deduce the correct subset argument when extracting the data. ", e)
}
)
} else {
mf <- mf[eval(x$call$subset,
envir = env
), ]
}
}
return(mf)
}
#' Get the models variables
#'
#' This function extract the modelled variables. Any interaction
#' terms are removed as those should already be represented by
#' the individual terms.
#'
#' @param model A model fit
#' @param remove_splines If splines, etc. should be cleaned
#' from the variables as these no longer are "pure" variables
#' @param remove_interaction_vars If interaction variables are
#' not interesting then these should be removed. Often in
#' the case of \code{\link{printCrudeAndAdjustedModel}()} it is impossible
#' to properly show interaction variables and it's better to show
#' these in a separate table
#' @param add_intercept Adds the intercept if it exists
#' @return vector with names
#'
#' @importFrom stringr str_split
#' @importFrom stringr str_trim
#' @keywords internal
prGetModelVariables <- function(model,
remove_splines = TRUE,
remove_interaction_vars = FALSE,
add_intercept = FALSE) {
# We need the call names in order to identify
# - interactions
# - functions such as splines, I()
if (inherits(model, "nlme")) {
vars <- attr(model$fixDF$terms, "names")
} else {
vars <- attr(model$terms, "term.labels")
}
strata <- NULL
if (any(grepl("^strat[a]{0,1}\\(", vars))) {
strata <- vars[grep("^strat[a]{0,1}\\(", vars)]
vars <- vars[-grep("^strat[a]{0,1}\\(", vars)]
}
cluster <- NULL
if (any(grepl("^cluster{0,1}\\(", vars))) {
cluster <- vars[grep("^cluster{0,1}\\(", vars)]
vars <- vars[-grep("^cluster{0,1}\\(", vars)]
}
# Fix for bug in cph and coxph
if (is.null(cluster) &&
inherits(model, c("cph", "coxph"))) {
alt_terms <- stringr::str_trim(strsplit(deparse(model$call$formula[[3]]),
"+",
fixed = TRUE
)[[1]])
if (any(grepl("^cluster{0,1}\\(", alt_terms))) {
cluster <- alt_terms[grep("^cluster{0,1}\\(", alt_terms)]
}
}
# Remove I() as these are not true variables
unwanted_vars <- grep("^I\\(.*$", vars)
if (length(unwanted_vars) > 0) {
attr(vars, "I() removed") <- vars[unwanted_vars]
vars <- vars[-unwanted_vars]
}
pat <- "^[[:alpha:]\\.]+[^(]+\\(.*$"
fn_vars <- grep(pat, vars)
if (length(fn_vars) > 0) {
if (remove_splines) {
# Remove splines and other functions
attr(vars, "functions removed") <- vars[fn_vars]
vars <- vars[-fn_vars]
} else {
# Cleane the variable names into proper names
# the assumption here is that the real variable
# name is the first one in the parameters
pat <- "^[[:alpha:]\\.]+.*\\(([^,)]+).*$"
vars[fn_vars] <- sub(pat, "\\1", vars[fn_vars])
}
}
# Remove interaction terms as these are not variables
int_term <- "^.+:.+$"
in_vars <- grep(int_term, vars)
if (length(in_vars) > 0) {
if (remove_interaction_vars) {
in_vn <- unlist(str_split(vars[in_vars], ":"),
use.names = FALSE
)
in_vars <- unique(c(in_vars, which(vars %in% in_vn)))
}
attr(vars, "interactions removed") <- vars[in_vars]
vars <- vars[-in_vars]
}
if (add_intercept &&
grepl("intercept", names(coef(model))[1], ignore.case = TRUE)) {
vars <- c(
names(coef(model))[1],
vars
)
}
clean_vars <- unique(vars)
attributes(clean_vars) <- attributes(vars)
if (!is.null(strata)) {
attr(clean_vars, "strata") <- strata
}
if (!is.null(cluster)) {
attr(clean_vars, "cluster") <- cluster
}
return(clean_vars)
}
#' Get statistics according to the type
#'
#' A simple function applied by the \code{\link[Gmisc]{getDescriptionStatsBy}()}
#' for the total column. This function is also used by \code{\link{printCrudeAndAdjustedModel}()}
#' in case of a basic linear regression is asked for a raw stat column
#'
#' @param x The variable that we want the statistics for
#' @param show_perc If this is a factor/proportion variable then we
#' might want to show the percentages
#' @param html If the output should be in html or LaTeX formatting
#' @param digits Number of decimal digits
#' @param numbers_first If number is to be prior to the percentage
#' @param useNA If missing should be included
#' @param show_all_values This is by default false as for instance if there is
#' no missing and there is only one variable then it is most sane to only show
#' one option as the other one will just be a complement to the first. For instance
#' sex - if you know gender then automatically you know the distribution of the
#' other sex as it's 100 \% - other \%.
#' @param continuous_fn A function for describing continuous variables
#' defaults to \code{\link{describeMean}()}
#' @param prop_fn A function for describing proportions, defaults to
#' the factor function
#' @param factor_fn A function for describing factors, defaults to
#' \code{\link{describeFactors}()}
#' @param percentage_sign If you want to suppress the percentage sign you
#' can set this variable to FALSE. You can also choose something else that
#' the default \% if you so wish by setting this variable.
#' @return A matrix or a vector depending on the settings
#'
#' TODO: Use the Gmisc function instead of this copy
#'
#' @importFrom Gmisc describeMean
#' @importFrom Gmisc describeFactors
#' @keywords internal
prGetStatistics <- function(x,
show_perc = FALSE,
html = TRUE,
digits = 1,
numbers_first = TRUE,
useNA = "no",
show_all_values = FALSE,
continuous_fn = describeMean,
factor_fn = describeFactors,
prop_fn = factor_fn,
percentage_sign = percentage_sign) {
useNA <- prConvertShowMissing(useNA)
if (is.factor(x) ||
is.logical(x) ||
is.character(x)) {
if (length(unique(x)) == 2) {
if (show_perc) {
total_table <- prop_fn(x,
html = html,
digits = digits,
number_first = numbers_first,
useNA = useNA,
percentage_sign = percentage_sign
)
} else {
total_table <- table(x, useNA = useNA)
names(total_table)[is.na(names(total_table))] <- "Missing"
# Choose only the reference level
# Note: Currently references are required
if (show_all_values == FALSE && FALSE) {
total_table <- total_table[names(total_table) %in% c(levels(x)[1], "Missing")]
}
}
} else {
if (show_perc) {
total_table <- factor_fn(x,
html = html,
digits = digits,
number_first = numbers_first,
useNA = useNA,
percentage_sign = percentage_sign
)
} else {
total_table <- table(x, useNA = useNA)
names(total_table)[is.na(names(total_table))] <- "Missing"
}
}
} else {
total_table <- continuous_fn(x,
html = html, digits = digits,
number_first = numbers_first,
useNA = useNA
)
# If a continuous variable has two rows then it's assumed that the second is the missing
if (length(total_table) == 2 &&
show_perc == FALSE) {
total_table[2] <- sum(is.na(x))
}
}
return(total_table)
}
#' A function for converting a useNA variable
#'
#' The variable is suppose to be directly compatible with
#' table(..., useNA = useNA). It throughs an error
#' if not compatible
#'
#' @param useNA Boolean or "no", "ifany", "always"
#' @return string
#'
#' @keywords internal
prConvertShowMissing <- function(useNA) {
if (useNA == FALSE || useNA == "no") {
useNA <- "no"
} else if (useNA == TRUE) {
useNA <- "ifany"
}
if (!useNA %in% c("no", "ifany", "always")) {
stop(sprintf("You have set an invalid option for useNA variable, '%s' ,it should be boolean or one of the options: no, ifany or always.", useNA))
}
return(useNA)
}
#' A function that tries to resolve what variable corresponds to what row
#'
#' As both the \code{\link{getCrudeAndAdjustedModelData}()} and the
#' \code{\link{printCrudeAndAdjustedModel}()} need to now exactly
#' what name from the \code{\link[stats]{coef}()}/\code{\link[rms]{summary.rms}()}
#' correspond to we for generalizeability this rather elaborate function.
#'
#' @param var_names The variable names that are saught after
#' @param available_names The names that are available to search through
#' @param data The data set that is saught after
#' @param force_match Whether all variables need to be identified or not.
#' E.g. you may only want to use some variables and already pruned the
#' \code{available_names} and therefore wont have matches. This is the
#' case when \code{\link{getCrudeAndAdjustedModelData}()} has been used together
#' with the \code{var_select} argument.
#' @return \code{list} Returns a list with each element has the corresponding
#' variable name and a subsequent list with the parameters \code{no_rows}
#' and \code{location} indiciting the number of rows corresponding to that
#' element and where those rows are located. For factors the list also contains
#' \code{lvls} and \code{no_lvls}.
#' @keywords internal
#' @import utils
prMapVariable2Name <- function(var_names, available_names,
data, force_match = TRUE) {
if (any(duplicated(available_names))) {
stop(
"You have non-unique names. You probably need to adjust",
" (1) variable names or (2) factor labels."
)
}
# Start with figuring out how many rows each variable
var_data <- list()
for (name in var_names) {
if (grepl("intercept", name, ignore.case = TRUE)) {
var_data[[name]] <-
list(no_rows = 1)
} else if (is.factor(data[, name])) {
var_data[[name]] <-
list(lvls = levels(data[, name]))
# Sometimes due to subsetting some factors don't exist
# we therefore need to remove those not actually in the dataset
var_data[[name]]$lvls <-
var_data[[name]]$lvls[var_data[[name]]$lvls %in%
as.character(unique(data[, name][!is.na(data[, name])]))]
var_data[[name]][["no_lvls"]] <- length(var_data[[name]]$lvls)
var_data[[name]][["no_rows"]] <- length(var_data[[name]]$lvls) - 1
} else {
var_data[[name]] <-
list(no_rows = 1)
}
}
# A function for stripping the name and the additional information
# from the available name in order to get the cleanest form
getResidualCharacters <- function(search, conflicting_name) {
residual_chars <- substring(conflicting_name, nchar(search) + 1)
if (!is.null(var_data[[search]]$lvls)) {
best_resid <- residual_chars
for (lvl in var_data[[search]]$lvls) {
new_resid <- sub(lvl, "", residual_chars,
fixed = TRUE
)
if (nchar(new_resid) < nchar(best_resid)) {
best_resid <- new_resid
if (nchar(new_resid) == 0) {
break
}
}
}
residual_chars <- best_resid
}
return(residual_chars)
}
matched_names <- c()
matched_numbers <- c()
org_available_names <- available_names
# Start with simple non-factored variables as these should give a single-line match
# then continue with the longest named variable
for (name in var_names[order(sapply(var_data, function(x) is.null(x$lvls)),
nchar(var_names),
decreasing = TRUE
)]) {
matches <- which(name == substr(available_names, 1, nchar(name)))
if (length(matches) == 0) {
if (force_match) {
stop(
"Sorry but the function could not find a match for '", name, "'",
" among any of the available names: '", paste(org_available_names,
collapse = "', '"
), "'"
)
}
} else if (length(matches) == 1) {
if (var_data[[name]]$no_rows != 1) {
stop(
"Expected more than one match for varible '", name, "'",
" the only positive match was '", available_names[matches], "'"
)
}
} else if (length(var_names) > length(matched_names) + 1) {
if (is.null(var_data[[name]]$lvls) &&
sum(name == available_names) == 1) {
# Check if the searched for variable is a non-factor variable
# if so then match if there is a perfect match
matches <- which(name == available_names)
} else if (length(var_names) > length(matched_names) + 1) {
# Check that there is no conflicting match
conflicting_vars <- var_names[var_names != name &
!var_names %in% matched_names]
possible_conflicts <- c()
for (conf_var in conflicting_vars) {
possible_conflicts <-
union(
possible_conflicts,
which(substr(available_names, 1, nchar(conflicting_vars)) %in%
conflicting_vars)
)
}
conflicts <- intersect(possible_conflicts, matches)
if (length(conflicts) > 0) {
conflicting_vars <- conflicting_vars[sapply(
conflicting_vars,
function(search) {
any(search == substr(available_names, 1, nchar(search)))
}
)]
for (conflict in conflicts) {
# We will try to find a better match that leaves fewer "residual characters"
# than what we started with
start_res_chars <- getResidualCharacters(name, available_names[conflict])
best_match <- NULL
best_conf_name <- NULL
for (conf_name in conflicting_vars) {
resid_chars <- getResidualCharacters(conf_name, available_names[conflict])
if (is.null(best_match) ||
nchar(best_match) > nchar(resid_chars)) {
best_match <- resid_chars
best_conf_name <- conf_name
}
}
if (nchar(start_res_chars) == nchar(best_match)) {
stop(
"The software can't decide which name belongs to which variable.",
" The variable that is searched for is '", name, "'",
" and there is a conflict with the variable '", best_conf_name, "'.",
" The best match for '", name, "' leaves: '", start_res_chars, "'",
" while the conflict '", best_conf_name, "' leaves: '", best_match, "'",
" when trying to match the name: '", available_names[conflict], "'"
)
} else if (nchar(start_res_chars) > nchar(best_match)) {
# Now remove the matched row if we actually found a better match
matches <- matches[matches != conflict]
}
}
}
}
if (length(matches) == 0) {
stop(
"Could not identify the rows corresponding to the variable '", name, "'",
" this could possibly be to similarity between different variable names",
" and factor levels. Try to make sure that all variable names are unique",
" the variables that are currently looked for are:",
" '", paste(var_names,
collapse = "', '"
),
"'."
)
}
}
# Check that multiple matches are continuous, everything else is suspicious
if (length(matches) > 1) {
matches <- matches[order(matches)]
if (any(1 != tail(matches, length(matches) - 1) -
head(matches, length(matches) - 1))) {
stop(
"The variable '", name, "' failed to provide an adequate",
" consequent number of matches, the names matched are located at:",
" '", paste(matches, collapse = "', '"), "'"
)
}
}
# Since we remove the matched names we need to look back at the original and
# find the exact match in order to deduce the true number
true_matches <- which(org_available_names %in%
available_names[matches])
# Avoid accidentally rematching
true_matches <- setdiff(true_matches, matched_numbers)
var_data[[name]][["location"]] <- true_matches
# Update the loop vars
if (length(matches) > 0) {
available_names <- available_names[-matches]
}
matched_names <- c(matched_names, name)
matched_numbers <- c(matched_numbers, true_matches)
if (length(var_data[[name]][["location"]]) == 0 &
!force_match) {
# Remove variable as it is not available
var_data[[name]] <- NULL
} else if (length(var_data[[name]][["location"]]) !=
var_data[[name]][["no_rows"]]) {
warning(
"Expected the variable '", name, "'",
" to contain '", var_data[[name]][["no_rows"]], "' no. rows",
" but got '", length(var_data[[name]][["location"]]), "' no. rows."
)
var_data[[name]][["no_rows"]] <- length(var_data[[name]][["location"]])
}
}
return(var_data)
}
#' Runs an \code{\link[Gmisc]{fastDoCall}()} within the environment of the model
#'
#' Sometimes the function can't find some of the variables that
#' were available when running the original variable. This function
#' uses the \code{\link[stats:formula]{as.formula}()} together with
#' \code{\link[base]{environment}()} in order to get the environment
#' that the original code used.
#'
#' @param model The model used
#' @param what The function or non-empty character string used for
#' \code{\link[Gmisc]{fastDoCall}()}
#' @param ... Additional arguments passed to the function
#' @keywords internal
prEnvModelCall <- function(model, what, ...) {
call_lst <- list(object = model)
dots <- list(...)
if (length(dots) > 0) {
for (i in 1:length(dots)) {
if (!is.null(names(dots)[i])) {
call_lst[[names(dots)[i]]] <- dots[[i]]
} else {
call_lst <- c(
call_lst,
dots[[i]]
)
}
}
}
model_env <- new.env(parent = environment(as.formula(model)))
model_env$what <- what
model_env$call_lst <- call_lst
fastDoCall(what, call_lst,
envir = model_env
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.