# R/utils.R

#' Find best-fitted model
#'
#' This utility function, used by the `rwToT` package, returns the best-fitted
#' parametric distribution model for the given survival/treatment data. It
#' uses the `flexsurv` package for parametric modeling of time-to-event data.
#' Parametric models based on the built-in Exponential (rate), Weibull (shape,
#' scale), Log-normal (meanlog, sdlog), Log-logistic (shape, scale), and
#' Gompertz (shape, rate) distributions are evaluated, and the best model is
#' selected as the one with the lowest AIC value. The function also returns the
#' restricted mean survival time of the best-fitted model up to the chosen
#' time \code{t}.
#'
#' @param df_cohort A dataframe. Minimum required columns:
#' \itemize{
#' \item{`STATUS` : Censor status (0 = censored, 1 = event)}
#' \item{`DD` : The follow-up time }}
#'
#' @param t A numeric value: the time point of interest
#' @return A list containing the best-fitted model (`output_model`) and its
#' restricted mean survival time up to the chosen time \code{t}
#' (`best_model_rmst_mean`)
#' @examples
#' # Find best-fitted model for AML dataset.
#' best_model(rwToT_aml, 20)
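#'
#' # A minimal sketch with simulated data (the cohort `df_sim` below is made
#' # up for illustration, not a package dataset): any data frame with
#' # follow-up time `DD` and censor status `STATUS` can be passed the same way.
#' df_sim <- data.frame(DD = rexp(100, rate = 0.1), STATUS = rbinom(100, 1, 0.7))
#' best_model(df_sim, 20)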
#' @export
#'
#' @importFrom dplyr %>% mutate
#' @importFrom survival Surv
#' @importFrom flexsurv flexsurvreg rmst_exp rmst_weibull rmst_lnorm rmst_llogis rmst_gompertz
#' @importFrom purrr map map_dbl
#' @importFrom stats setNames
best_model <- function(df_cohort, t) {
  
  # create a survival time object and add it to the data frame
  df_cohort <- df_cohort %>%
    mutate(SurvObj = Surv(DD, STATUS))

  # fit a flexible parametric model for one candidate distribution;
  # NA is returned when the fit fails
  fit_model <- function(model_dist) {
    tryCatch(flexsurvreg(SurvObj ~ 1, data = df_cohort, dist = model_dist),
             error = function(e) NA)
  }
  
  model      <- c("Exp", "Weibull", "LogNorm", "LogLogist", "Gompertz")
  model_dist <- c("exponential", "weibull", "lnorm", "llogis", "gompertz")

  # fit every candidate distribution and name the results
  list_of_model <- setNames(map(model_dist, fit_model), model)

  # extract the AIC of each successful fit; failed fits get NA
  model_aic <- purrr::map_dbl(list_of_model, function(m) {
    if (inherits(m, "flexsurvreg")) m$AIC else NA_real_
  })

  # make the fitted models available by name (Exp, Weibull, ...) inside this
  # function's environment instead of writing them to the global environment
  list2env(list_of_model, environment())

  model_sum <- data.frame(type = model, AIC = model_aic, stringsAsFactors = FALSE)

  # best model = lowest AIC among the fits that converged
  best_model <- model_sum %>%
    dplyr::filter(!is.na(AIC)) %>%
    dplyr::arrange(AIC) %>%
    dplyr::slice(1) %>%
    dplyr::pull(type)

  if (length(best_model) == 0) {
    stop("None of the parametric models could be fitted to the supplied data.")
  }
  
  #best model, restricted mean for output table
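  # Note: flexsurv's rmst_* helpers return the restricted mean survival time up
  # to t for the supplied parameter values. The point estimate below uses
  # column 1 of $res and the interval uses the L95%/U95% parameter columns
  # (2 and 3) -- a plug-in approximation, not a formal CI on the RMST itself.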
  if (best_model == "Exp") {
    output_model <- Exp
    mean_predicted <- round(rmst_exp(t, rate = Exp$res[1, 1], start = 0), 1)
    mean_u95 <- round(rmst_exp(t, rate = Exp$res[1, 2], start = 0), 1)
    mean_l95 <- round(rmst_exp(t, rate = Exp$res[1, 3], start = 0), 1)
    mean_sum <- paste(mean_predicted, " (", mean_l95, " - ", mean_u95, ")", sep = "")
    best_model_rmst_mean <- list(mean_sum, "Exp")
  } else if (best_model == "Weibull") {
    output_model <- Weibull
    mean_predicted <- round(rmst_weibull(t, shape = Weibull$res[1, 1], scale = Weibull$res[2, 1], start = 0), 1)
    mean_l95 <- round(rmst_weibull(t, shape = Weibull$res[1, 2], scale = Weibull$res[2, 2], start = 0), 1)
    mean_u95 <- round(rmst_weibull(t, shape = Weibull$res[1, 3], scale = Weibull$res[2, 3], start = 0), 1)
    mean_sum <- paste(mean_predicted, " (", mean_l95, " - ", mean_u95, ")", sep = "")
    best_model_rmst_mean <- list(mean_sum, "Weibull")
  } else if (best_model == "LogNorm") {
    output_model <- LogNorm
    mean_predicted <- round(rmst_lnorm(t, meanlog = LogNorm$res[1, 1], sdlog = LogNorm$res[2, 1], start = 0), 1)
    mean_l95 <- round(rmst_lnorm(t, meanlog = LogNorm$res[1, 2], sdlog = LogNorm$res[2, 2], start = 0), 1)
    mean_u95 <- round(rmst_lnorm(t, meanlog = LogNorm$res[1, 3], sdlog = LogNorm$res[2, 3], start = 0), 1)
    mean_sum <- paste(mean_predicted, " (", mean_l95, " - ", mean_u95, ")", sep = "")
    best_model_rmst_mean <- list(mean_sum, "LogNorm")
  } else if (best_model == "LogLogist") {
    output_model <- LogLogist
    mean_predicted <- round(rmst_llogis(t, shape = LogLogist$res[1, 1], scale = LogLogist$res[2, 1], start = 0), 1)
    mean_l95 <- round(rmst_llogis(t, shape = LogLogist$res[1, 2], scale = LogLogist$res[2, 2], start = 0), 1)
    mean_u95 <- round(rmst_llogis(t, shape = LogLogist$res[1, 3], scale = LogLogist$res[2, 3], start = 0), 1)
    mean_sum <- paste(mean_predicted, " (", mean_l95, " - ", mean_u95, ")", sep = "")
    best_model_rmst_mean <- list(mean_sum, "LogLogist")
  } else if (best_model == "Gompertz") {
    output_model <- Gompertz
    mean_predicted <- round(rmst_gompertz(t, shape = Gompertz$res[1, 1], rate = Gompertz$res[2, 1], start = 0), 1)
    mean_u95 <- round(rmst_gompertz(t, shape = Gompertz$res[1, 2], rate = Gompertz$res[2, 2], start = 0), 1)
    mean_l95 <- round(rmst_gompertz(t, shape = Gompertz$res[1, 3], rate = Gompertz$res[2, 3], start = 0), 1)
    mean_sum <- paste(mean_predicted, " (", mean_l95, " - ", mean_u95, ")", sep = "")
    best_model_rmst_mean <- list(mean_sum, "Gompertz")
  }
  return(list("output_model" = output_model,
              "best_model_rmst_mean" = best_model_rmst_mean))
}
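
# The AIC-based selection that best_model() automates amounts to fitting each
# candidate distribution with flexsurvreg() and comparing the fits' AIC values.
# A minimal illustrative sketch, kept as a comment so nothing runs at package
# load (`ovarian` is the example dataset shipped with the survival package):
#
# library(flexsurv)
# fits <- lapply(c("exponential", "weibull", "lnorm", "llogis", "gompertz"),
#                function(d) flexsurvreg(Surv(futime, fustat) ~ 1,
#                                        data = survival::ovarian, dist = d))
# sapply(fits, function(f) f$AIC)   # the smallest AIC identifies the best fit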


# Count Proportion
#
# Utility function that returns the count and percentage of censored
# (STATUS = 0) and event (STATUS = 1) records, in that order.
count_proportion <- function(x) {
  # tabulate against both levels so censored (0) and event (1) counts are
  # always present and always reported in the same order
  stat_count <- table(factor(x$STATUS, levels = c(0, 1)))
  stat_prop <- round((stat_count / sum(stat_count)) * 100, 1)
  return(paste0(stat_count, " (", stat_prop, "%)"))
}
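
# Illustrative (hypothetical) call, kept as a comment so nothing runs at
# package load: for 30 censored and 70 event records the function returns
# "30 (30%)" followed by "70 (70%)".
#
# count_proportion(data.frame(STATUS = c(rep(0, 30), rep(1, 70))))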

# Dataframe Validation
#
# Validates that the input data frame contains the columns required to run the model.
validate_df_columns <- function(df_cohort) {
  columns <- c("STATUS", "DD", "START_DATE", "LAST_ACTIVITY_DATE")
  if (!is.data.frame(df_cohort)) {
    stop(paste("Argument", deparse(substitute(df_cohort)), "must be a data.frame."))
  }
  has_col <- rlang::has_name(df_cohort, columns)
  if (!all(has_col)) {
    stop(sprintf(
      "%s doesn't contain: %s",
      deparse(substitute(df_cohort)),
      paste(columns[!has_col], collapse = ", ")))
  }
}
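
# Illustrative (hypothetical) call, kept as a comment so nothing runs at
# package load: a data frame missing any of the required columns triggers an
# error naming the absent columns (here START_DATE and LAST_ACTIVITY_DATE).
#
# validate_df_columns(data.frame(STATUS = 1, DD = 10))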