R/AutoScore.R


# Pipeline_function --------------------------------------------------------

#' @title AutoScore STEP(i): Rank variables with machine learning (AutoScore Module 1)
#' @param train_set A processed \code{data.frame} that contains data to be analyzed, for training.
#' @param validation_set A processed \code{data.frame} that contains data to be analyzed; only needed for AUC-based ranking.
#' @param method Method for ranking. Options: 1. `rf` - random forest (default), 2. `auc` - AUC-based (requires a validation set). For "auc", univariable models are built on the train set, and the variable ranking is constructed from the AUC performance of the corresponding univariable models on the validation set (`validation_set`).
#' @param ntree Number of trees in the random forest (Default: 100).
#' @details The first step in the AutoScore framework is variable ranking. We use random forest (RF),
#' an ensemble machine learning algorithm, to identify the top-ranking predictors for subsequent score generation.
#' This step corresponds to Module 1 in the AutoScore paper.
#' @return Returns a named vector containing the list of variables and their ranking generated by machine learning (random forest)
#' @examples
#' # see AutoScore Guidebook for the whole 5-step workflow
#' data("sample_data")
#' names(sample_data)[names(sample_data) == "Mortality_inpatient"] <- "label"
#' ranking <- AutoScore_rank(sample_data, ntree = 50)
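#'
#' # AUC-based ranking requires a validation set (a minimal sketch; the split
#' # ratio below is illustrative only):
#' \donttest{
#' out_split <- split_data(data = sample_data, ratio = c(0.7, 0.1, 0.2))
#' ranking_auc <- AutoScore_rank(out_split$train_set, out_split$validation_set,
#'                               method = "auc")
#' }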
#' @references
#' \itemize{
#'  \item{Breiman, L. (2001), Random Forests, Machine Learning 45(1), 5-32}
#'  \item{Xie F, Chakraborty B, Ong MEH, Goldstein BA, Liu N. AutoScore: A Machine Learning-Based Automatic Clinical Score Generator and
#'   Its Application to Mortality Prediction Using Electronic Health Records. JMIR Medical Informatics 2020;8(10):e21798}
#' }
#' @seealso \code{\link{AutoScore_parsimony}}, \code{\link{AutoScore_weighting}}, \code{\link{AutoScore_fine_tuning}}, \code{\link{AutoScore_testing}}, Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @export
#' @importFrom randomForest randomForest importance
#'
AutoScore_rank <- function(train_set, validation_set = NULL, method = "rf", ntree = 100) {
  # set.seed(4)
  if (method == "rf") {
    train_set <- AutoScore_impute(train_set)
    train_set$label <- as.factor(train_set$label)
    model <-
      randomForest::randomForest(label ~ .,
                                 data = train_set,
                                 ntree = ntree
      )

    # estimate variable importance
    importance <- randomForest::importance(model, scale = FALSE)

    # summarize importance
    names(importance) <- rownames(importance)
    importance <- sort(importance, decreasing = TRUE)
    cat("The ranking based on variable importance is shown below for each variable: \n")
    print(importance)
    plot_importance(importance)
    return(importance)
  }


  if (method == "auc") {
    if (is.null(validation_set)) {
      stop("Error: Please specify the validation set","\n",call.=FALSE)
    }
    imputation <- AutoScore_impute(train_set, validation_set)
    train_set <- imputation$train
    validation_set <- imputation$validation

    vars <- names(train_set)
    vars <- vars[vars != "label"]
    train_set$label <- as.factor(train_set$label)
    AUC <- rep(0, length(vars))

    for (i in seq_along(vars)) {
      # log <- sprintf("--------%s--------", vars[i])
      # print(log)
      if (length(unique(train_set[[vars[i]]])) > 1) {
        model <-
          glm(label ~ ., data = train_set[c("label", vars[i])], family = binomial(link = "logit"))
        pred <- predict(model, newdata = validation_set[c("label", vars[i])])
        # confusionMatrix(pred, as.factor(as.character(val_set$label)))
        roc_obj <- roc(response = validation_set$label, predictor = as.numeric(pred), quiet = TRUE)
        AUC[i] <- auc(roc_obj)[[1]]
      } else {
        # the model cannot be built when the variable has only one unique value
        AUC[i] <- 0
      }
    }

    # summarize importance (AUC)
    names(AUC) <- vars
    AUC <- sort(AUC, decreasing = TRUE)
    cat("The AUC-based ranking based on variable importance is shown below for each variable: \n")
    print(AUC)
    plot_importance(AUC)
    return(AUC)
  }
  else {
    warning("Please specify a method among the available options: \"rf\", \"auc\"\n")
  }
}


#' @title AutoScore STEP(ii): Select the best model with parsimony plot (AutoScore Modules 2+3+4)
#' @param train_set A processed \code{data.frame} that contains data to be analyzed, for training.
#' @param validation_set A processed \code{data.frame} that contains data for validation purpose.
#' @param rank The ranking result generated from AutoScore STEP(i) \code{\link{AutoScore_rank}}
#' @param n_min Minimum number of selected variables (Default: 1).
#' @param n_max Maximum number of selected variables (Default: 20).
#' @param max_score Maximum total score (Default: 100).
#' @param cross_validation If set to \code{TRUE}, cross-validation will be used to generate the parsimony plot, which is
#'   suitable for small-sized data. Default to \code{FALSE}.
#' @param fold The number of folds used in cross validation (Default: 10). Available if \code{cross_validation = TRUE}.
#' @param categorize  Methods for categorizing continuous variables. Options include "quantile" or "kmeans" (Default: "quantile").
#' @param quantiles Predefined quantiles to convert continuous variables to categorical ones. (Default: c(0, 0.05, 0.2, 0.8, 0.95, 1)) Available if \code{categorize = "quantile"}.
#' @param max_cluster The maximum number of clusters (Default: 5). Available if \code{categorize = "kmeans"}.
#' @param do_trace If set to TRUE, all results based on each fold of cross-validation will be printed out and plotted (Default: FALSE). Available if \code{cross_validation = TRUE}.
#' @param auc_lim_min Minimum y-axis limit in the parsimony plot (Default: 0.5).
#' @param auc_lim_max Maximum y-axis limit in the parsimony plot (Default: "adaptive").
#' @details This is the second step of the general AutoScore workflow: generate the parsimony plot to help select a parsimonious model.
#'  In this step, AutoScore Modules 2, 3 and 4 are run multiple times to evaluate the performance under different variable lists.
#'  The generated parsimony plot gives researchers an intuitive figure from which to choose the best model.
#'  If the data size is small (i.e., <5000), an independent validation set may not be a wise choice. Then, we suggest using cross-validation
#'  to maximize the utility of data. Set \code{cross_validation=TRUE}. Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @return List of AUC values for models with different numbers of variables
#' @examples
#' \donttest{
#' # see AutoScore Guidebook for the whole 5-step workflow
#' data("sample_data")
#' names(sample_data)[names(sample_data) == "Mortality_inpatient"] <- "label"
#' out_split <- split_data(data = sample_data, ratio = c(0.7, 0.1, 0.2))
#' train_set <- out_split$train_set
#' validation_set <- out_split$validation_set
#' ranking <- AutoScore_rank(train_set, ntree=100)
#' AUC <- AutoScore_parsimony(
#'   train_set,
#'   validation_set,
#'   rank = ranking,
#'   max_score = 100,
#'   n_min = 1,
#'   n_max = 20,
#'   categorize = "quantile",
#'   quantiles = c(0, 0.05, 0.2, 0.8, 0.95, 1)
#' )}
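#'
#' # Using cross-validation instead of a hold-out validation set (a sketch,
#' # suitable when the data size is small):
#' \donttest{
#' AUC <- AutoScore_parsimony(
#'   train_set,
#'   validation_set,
#'   rank = ranking,
#'   max_score = 100,
#'   n_min = 1,
#'   n_max = 20,
#'   cross_validation = TRUE,
#'   fold = 10
#' )}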
#' @references
#' \itemize{
#'  \item{Xie F, Chakraborty B, Ong MEH, Goldstein BA, Liu N, AutoScore: A Machine Learning-Based Automatic Clinical
#'   Score Generator and Its Application to Mortality Prediction Using Electronic Health Records,
#'   JMIR Med Inform 2020;8(10):e21798, doi: 10.2196/21798}
#' }
#' @seealso \code{\link{AutoScore_rank}}, \code{\link{AutoScore_weighting}}, \code{\link{AutoScore_fine_tuning}}, \code{\link{AutoScore_testing}}, Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @export
#' @import  pROC
AutoScore_parsimony <-
  function(train_set,
           validation_set,
           rank,
           max_score = 100,
           n_min = 1,
           n_max = 20,
           cross_validation = FALSE,
           fold = 10,
           categorize = "quantile",
           quantiles = c(0, 0.05, 0.2, 0.8, 0.95, 1),
           max_cluster = 5,
           do_trace = FALSE,
           auc_lim_min = 0.5,
           auc_lim_max = "adaptive") {
    if (n_max > length(rank)) {
      warning(
        "WARNING: the n_max (",
        n_max,
        ") is larger than the number of all variables (",
        length(rank),
        "). We automatically revise n_max to ",
        length(rank)
      )
      n_max <- length(rank)
    }
    # Cross Validation scenario
    if (cross_validation == TRUE) {
      # Divide the data equally into n folds, recording the row indices of each fold
      #set.seed(4)
      index <- list()
      all <- seq_len(nrow(train_set))
      for (i in 1:(fold - 1)) {
        a <- sample(all, trunc(nrow(train_set) / fold))
        index <- append(index, list(a))
        all <- all[!(all %in% a)]
      }
      index <- c(index, list(all))

      # Create a new variable auc_set to store all AUC values during the cross-validation
      auc_set <- data.frame(rep(0, n_max - n_min + 1))

      # for each fold, generate train_set and validation_set
      for (j in 1:fold) {
        validation_set_temp <- train_set[index[[j]],]
        train_set_tmp <- train_set[-index[[j]],]

        #variable_list <- names(rank)
        AUC <- c()

        # Go through AutoScore Module 2/3/4 in the loop
        for (i in n_min:n_max) {
          variable_list <- names(rank)[1:i]
          train_set_1 <- train_set_tmp[, c(variable_list, "label")]
          validation_set_1 <-
            validation_set_temp[, c(variable_list, "label")]

          model_roc <-
            compute_auc_val(
              train_set_1,
              validation_set_1,
              variable_list,
              categorize,
              quantiles,
              max_cluster,
              max_score
            )
          #print(auc(model_roc))
          AUC <- c(AUC, auc(model_roc))
        }

        # plot parsimony plot for each fold
        names(AUC) <- n_min:n_max

        # only print and plot when do_trace = TRUE
        if (do_trace) {
          print(paste("list of AUC values for fold", j))
          print(data.frame(AUC))
          plot(
            AUC,
            main = paste("Parsimony plot (cross validation) for fold", j),
            xlab = "Number of Variables",
            ylab = "Area Under the Curve",
            col = "#2b8cbe",
            lwd = 2,
            type = "o"
          )
        }

        # store AUC result from each fold into "auc_set"
        auc_set <- cbind(auc_set, data.frame(AUC))
      }

      # finish loop and then output final results averaged by all folds
      auc_set$rep.0..n_max...n_min...1. <- NULL
      auc_set$sum <- rowSums(auc_set) / fold
      cat("***list of final mean AUC values through cross-validation are shown below \n")
      print(data.frame(auc_set$sum))

      # output final results and plot parsimony plot
      plot(plot_auc(AUC = auc_set$sum, variables = names(rank)[n_min:n_max],
                    num = n_min:n_max,
                    auc_lim_min = auc_lim_min, auc_lim_max = auc_lim_max,
                    ylab = "Area Under the Curve",
                    title = paste0("Final Parsimony plot based on ", fold,
                                   "-fold Cross Validation")))

      return(auc_set)
    }


    # if Cross validation is FALSE
    else{
      AUC <- c()

      # Go through AutoScore Module 2/3/4 in the loop
      for (i in n_min:n_max) {
        cat(paste("Select", i, "Variable(s):  "))

        variable_list <- names(rank)[1:i]
        train_set_1 <- train_set[, c(variable_list, "label")]
        validation_set_1 <-
          validation_set[, c(variable_list, "label")]
        model_roc <-
          compute_auc_val(
            train_set_1,
            validation_set_1,
            variable_list,
            categorize,
            quantiles,
            max_cluster,
            max_score
          )
        print(auc(model_roc))
        AUC <- c(AUC, auc(model_roc))
      }

      plot(plot_auc(AUC = AUC, variables = names(rank)[n_min:n_max],
                    num = n_min:n_max,
                    auc_lim_min = auc_lim_min, auc_lim_max = auc_lim_max,
                    ylab = "Area Under the Curve",
                    title = "Parsimony plot on the validation set"))
      names(AUC) <- names(rank)[n_min:n_max]

      return(AUC)
    }
  }


#' @title AutoScore STEP(iii): Generate the initial score with the final list of variables (Re-run AutoScore Modules 2+3)
#' @param train_set A processed \code{data.frame} that contains data to be analyzed, for training.
#' @param validation_set A processed \code{data.frame} that contains data for validation purpose.
#' @param final_variables A vector containing the list of selected variables, selected from Step(ii) \code{\link{AutoScore_parsimony}}. Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @param max_score Maximum total score (Default: 100).
#' @param categorize  Methods for categorizing continuous variables. Options include "quantile" or "kmeans" (Default: "quantile").
#' @param quantiles Predefined quantiles to convert continuous variables to categorical ones. (Default: c(0, 0.05, 0.2, 0.8, 0.95, 1)) Available if \code{categorize = "quantile"}.
#' @param max_cluster The maximum number of clusters (Default: 5). Available if \code{categorize = "kmeans"}.
#' @param metrics_ci Whether to calculate confidence intervals for metrics such as sensitivity and specificity (Default: FALSE).
#' @return Generated \code{cut_vec} for downstream fine-tuning process STEP(iv) \code{\link{AutoScore_fine_tuning}}.
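#' @examples
#' \dontrun{
#' # A minimal sketch of the workflow up to this step (see the Guidebook for
#' # details); the choice of the top 6 variables below is illustrative only.
#' data("sample_data")
#' names(sample_data)[names(sample_data) == "Mortality_inpatient"] <- "label"
#' out_split <- split_data(data = sample_data, ratio = c(0.7, 0.1, 0.2))
#' train_set <- out_split$train_set
#' validation_set <- out_split$validation_set
#' ranking <- AutoScore_rank(train_set, ntree = 100)
#' final_variables <- names(ranking[1:6])
#' cut_vec <- AutoScore_weighting(train_set, validation_set, final_variables,
#'                                max_score = 100, categorize = "quantile",
#'                                quantiles = c(0, 0.05, 0.2, 0.8, 0.95, 1))
#' }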
#' @references
#' \itemize{
#'  \item{Xie F, Chakraborty B, Ong MEH, Goldstein BA, Liu N. AutoScore: A Machine Learning-Based Automatic Clinical Score Generator and
#'   Its Application to Mortality Prediction Using Electronic Health Records. JMIR Medical Informatics 2020;8(10):e21798}
#' }
#' @seealso \code{\link{AutoScore_rank}}, \code{\link{AutoScore_parsimony}}, \code{\link{AutoScore_fine_tuning}}, \code{\link{AutoScore_testing}}, Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @export
#' @import pROC ggplot2
AutoScore_weighting <-
  function(train_set,
           validation_set,
           final_variables,
           max_score = 100,
           categorize = "quantile",
           max_cluster = 5,
           quantiles = c(0, 0.05, 0.2, 0.8, 0.95, 1),
           metrics_ci = FALSE) {
    # prepare train_set and Validation Set
    cat("****Included Variables: \n")
    print(data.frame(variable_name = final_variables))
    train_set_1 <- train_set[, c(final_variables, "label")]
    validation_set_1 <-
      validation_set[, c(final_variables, "label")]

    # AutoScore Module 2 : cut numeric and transfer categories and generate "cut_vec"
    cut_vec <-
      get_cut_vec(
        train_set_1,
        categorize = categorize,
        quantiles = quantiles,
        max_cluster = max_cluster
      )
    train_set_2 <- transform_df_fixed(train_set_1, cut_vec)
    validation_set_2 <-
      transform_df_fixed(validation_set_1, cut_vec)

    # AutoScore Module 3 : Score weighting
    score_table <-
      compute_score_table(train_set_2, max_score, final_variables)
    cat("****Initial Scores: \n")
    #print(as.data.frame(score_table))
    print_scoring_table(scoring_table = score_table, final_variable = final_variables)

    # Using "assign_score" to generate score based on new dataset and Scoring table "score_table"
    validation_set_3 <- assign_score(validation_set_2, score_table)
    validation_set_3$total_score <-
      rowSums(subset(validation_set_3, select = names(validation_set_3)[names(validation_set_3) !=
                                                                          "label"]))
    y_validation <- validation_set_3$label

    # Intermediate evaluation based on Validation Set
    plot_roc_curve(validation_set_3$total_score, as.numeric(y_validation) - 1)
    cat("***Performance (based on validation set):\n")
    print_roc_performance(y_validation, validation_set_3$total_score, threshold = "best", metrics_ci = metrics_ci)
    cat(
      "***The cutoffs of each variable generated by AutoScore are saved in cut_vec. You can decide whether to revise or fine-tune them \n"
    )
    #print(cut_vec)
    return(cut_vec)
  }


#' @title AutoScore STEP(iv): Fine-tune the score by revising cut_vec with domain knowledge (AutoScore Module 5)
#' @description Domain knowledge is essential in guiding risk model development.
#'  For continuous variables, the variable transformation is a data-driven process (based on "quantile" or "kmeans").
#'  In this step, the automatically generated cutoff values for each continuous variable can be fine-tuned
#'  by combining, rounding, and adjusting according to standard clinical norms. The revised \code{cut_vec}, informed by domain knowledge, is used to
#' update the scoring table. Users can choose any cut-off values and any number of categories; the final scoring table is then generated. Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @param train_set A processed \code{data.frame} that contains data to be analyzed, for training.
#' @param validation_set A processed \code{data.frame} that contains data for validation purpose.
#' @param final_variables A vector containing the list of selected variables, selected from Step(ii) \code{\link{AutoScore_parsimony}}. Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @param max_score Maximum total score (Default: 100).
#' @param cut_vec Generated from STEP(iii) \code{\link{AutoScore_weighting}}. Please follow the guidebook.
#' @param metrics_ci Whether to calculate confidence intervals for metrics such as sensitivity and specificity (Default: FALSE).
#' @return Generated final table of scoring model for downstream testing
#' @examples
#' ## Please see the guidebook or vignettes
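#'
#' # A minimal sketch (assumes train_set, validation_set, final_variables and
#' # cut_vec from the previous steps; the revised cut-offs for "Age" are
#' # illustrative only):
#' \dontrun{
#' cut_vec$Age <- c(50, 75, 90)
#' scoring_table <- AutoScore_fine_tuning(train_set, validation_set,
#'                                        final_variables, cut_vec,
#'                                        max_score = 100)
#' }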
#' @references
#' \itemize{
#'  \item{Xie F, Chakraborty B, Ong MEH, Goldstein BA, Liu N. AutoScore: A Machine Learning-Based Automatic Clinical Score Generator and
#'   Its Application to Mortality Prediction Using Electronic Health Records. JMIR Medical Informatics 2020;8(10):e21798}
#' }
#' @seealso \code{\link{AutoScore_rank}}, \code{\link{AutoScore_parsimony}}, \code{\link{AutoScore_weighting}}, \code{\link{AutoScore_testing}},Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @export
#' @import pROC ggplot2
AutoScore_fine_tuning <-
  function(train_set,
           validation_set,
           final_variables,
           cut_vec,
           max_score = 100,
           metrics_ci = FALSE) {
    # Prepare train_set and Validation Set
    train_set_1 <- train_set[, c(final_variables, "label")]
    validation_set_1 <-
      validation_set[, c(final_variables, "label")]

    # AutoScore Module 2 : cut numeric and transfer categories (based on the fixed "cut_vec" vector)
    train_set_2 <-
      transform_df_fixed(train_set_1, cut_vec = cut_vec)
    validation_set_2 <-
      transform_df_fixed(validation_set_1, cut_vec = cut_vec)

    # AutoScore Module 3 : Score weighting
    score_table <-
      compute_score_table(train_set_2, max_score, final_variables)
    cat("***Fine-tuned Scores: \n")
    #print(as.data.frame(score_table))
    print_scoring_table(scoring_table = score_table, final_variable = final_variables)

    # Using "assign_score" to generate score based on new dataset and Scoring table "score_table"
    validation_set_3 <- assign_score(validation_set_2, score_table)
    validation_set_3$total_score <-
      rowSums(subset(validation_set_3, select = names(validation_set_3)[names(validation_set_3) !=
                                                                          "label"])) ## which name ="label"
    y_validation <- validation_set_3$label

    # Intermediate evaluation based on Validation Set after fine-tuning
    plot_roc_curve(validation_set_3$total_score, as.numeric(y_validation) - 1)
    cat("***Performance (based on validation set, after fine-tuning):\n")
    print_roc_performance(y_validation, validation_set_3$total_score, threshold = "best", metrics_ci = metrics_ci)
    return(score_table)
  }


#' @title AutoScore STEP(v): Evaluate the final score with ROC analysis (AutoScore Module 6)
#' @param test_set A processed \code{data.frame} that contains data for testing purposes. This \code{data.frame} should have the same format as
#'        \code{train_set} (same variable names and outcomes)
#' @param final_variables A vector containing the list of selected variables, selected from Step(ii) \code{\link{AutoScore_parsimony}}. Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @param scoring_table The final scoring table after fine-tuning, generated from STEP(iv) \code{\link{AutoScore_fine_tuning}}. Please follow the guidebook.
#' @param cut_vec Generated from STEP(iii) \code{\link{AutoScore_weighting}}. Please follow the guidebook.
#' @param threshold Score threshold for the ROC analysis to generate sensitivity, specificity, etc. If set to "best", the optimal threshold will be calculated (Default: "best").
#' @param with_label Set to \code{TRUE} if the \code{test_set} contains labels, so that performance can be evaluated accordingly (Default: TRUE).
#' Set it to \code{FALSE} if there is no "label" column in the \code{test_set}; the final predicted scores will then be returned without performance evaluation.
#' @param metrics_ci Whether to calculate confidence intervals for metrics such as sensitivity and specificity (Default: TRUE).
#' @return A data frame with predicted score and the outcome for downstream visualization.
#' @examples
#' ## Please see the guidebook or vignettes
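#'
#' # A minimal sketch (assumes test_set, final_variables, cut_vec and
#' # scoring_table from the previous steps):
#' \dontrun{
#' pred_score <- AutoScore_testing(test_set, final_variables, cut_vec,
#'                                 scoring_table, threshold = "best",
#'                                 with_label = TRUE)
#' }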
#' @references
#' \itemize{
#'  \item{Xie F, Chakraborty B, Ong MEH, Goldstein BA, Liu N. AutoScore: A Machine Learning-Based Automatic Clinical Score Generator and
#'   Its Application to Mortality Prediction Using Electronic Health Records. JMIR Medical Informatics 2020;8(10):e21798}
#' }
#' @seealso \code{\link{AutoScore_rank}}, \code{\link{AutoScore_parsimony}}, \code{\link{AutoScore_weighting}}, \code{\link{AutoScore_fine_tuning}}, \code{\link{print_roc_performance}}, Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#' @export
#' @import pROC ggplot2
AutoScore_testing <-
  function(test_set,
           final_variables,
           cut_vec,
           scoring_table,
           threshold = "best",
           with_label = TRUE,
           metrics_ci = TRUE) {
    if (with_label) {
      # prepare test set: categorization and "assign_score"
      test_set_1 <- test_set[, c(final_variables, "label")]
      test_set_2 <-
        transform_df_fixed(test_set_1, cut_vec = cut_vec)
      test_set_3 <- assign_score(test_set_2, scoring_table)
      test_set_3$total_score <-
        rowSums(subset(test_set_3, select = names(test_set_3)[names(test_set_3) !=
                                                                "label"]))
      test_set_3$total_score[which(is.na(test_set_3$total_score))] <-
        0
      y_test <- test_set_3$label

      # Final evaluation based on testing set
      plot_roc_curve(test_set_3$total_score, as.numeric(y_test) - 1)
      cat("***Performance using AutoScore:\n")
      model_roc <- roc(y_test, test_set_3$total_score, quiet = T)
      print_roc_performance(y_test, test_set_3$total_score, threshold = threshold, metrics_ci = metrics_ci)
      #Modelprc <- pr.curve(test_set_3$total_score[which(y_test == 1)],test_set_3$total_score[which(y_test == 0)],curve = TRUE)
      #values<-coords(model_roc, "best", ret = c("specificity", "sensitivity", "accuracy", "npv", "ppv", "precision"), transpose = TRUE)
      pred_score <-
        data.frame(pred_score = test_set_3$total_score, Label = y_test)
      return(pred_score)

    } else {
      test_set_1 <- test_set[, c(final_variables)]
      test_set_2 <-
        transform_df_fixed(test_set_1, cut_vec = cut_vec)
      test_set_3 <- assign_score(test_set_2, scoring_table)
      test_set_3$total_score <-
        rowSums(subset(test_set_3, select = names(test_set_3)[names(test_set_3) !=
                                                                "label"]))
      test_set_3$total_score[which(is.na(test_set_3$total_score))] <-
        0
      pred_score <-
        data.frame(pred_score = test_set_3$total_score, Label = NA)
      return(pred_score)
    }
  }


# Direct_function ---------------------------------------------------------



#' @title AutoScore function: Univariable Analysis
#' @description Perform univariable analysis and generate the result table with odds ratios.
#' @param df data frame after checking
#' @return A data frame with the results of the univariable analysis (odds ratios with 95% CI and p values)
#' @examples
#' data("sample_data")
#' names(sample_data)[names(sample_data) == "Mortality_inpatient"] <- "label"
#' uni_table<-compute_uni_variable_table(sample_data)
#' @export
compute_uni_variable_table <- function(df) {
  uni_table <- data.frame()
  for (i in names(df)[names(df) != "label"]) {
    model <-
      glm(
        as.formula("label ~ ."),
        data = subset(df, select = c("label", i)),
        family = binomial,
        na.action = na.omit
      )
    a <-
      cbind(exp(cbind(OR = coef(model), confint.default(model))), summary(model)$coef[, "Pr(>|z|)"])
    uni_table <- rbind(uni_table, a)
  }
  uni_table <-
    uni_table[!grepl("Intercept", row.names(uni_table), ignore.case = T), ]
  uni_table <- round(uni_table, digits = 3)
  uni_table$V4[uni_table$V4 < 0.001] <- "<0.001"
  uni_table$OR <-
    paste(uni_table$OR,
          "(",
          uni_table$`2.5 %`,
          "-",
          uni_table$`97.5 %`,
          ")",
          sep = "")
  uni_table$`2.5 %` <- NULL
  uni_table$`97.5 %` <- NULL
  names(uni_table)[names(uni_table) == "V4"] <- "p value"
  return(uni_table)
}


#' @title AutoScore function: Multivariate Analysis
#' @description Generate the result table for multivariate analysis with adjusted odds ratios.
#' @param df data frame after checking
#' @return A data frame with the results of the multivariate analysis (adjusted odds ratios with 95% CI and p values)
#' @examples
#' data("sample_data")
#' names(sample_data)[names(sample_data) == "Mortality_inpatient"] <- "label"
#' multi_table<-compute_multi_variable_table(sample_data)
#' @export
compute_multi_variable_table <- function(df) {
  model <-
    glm(label ~ .,
        data = df,
        family = binomial,
        na.action = na.omit)
  multi_table <-
    cbind(exp(cbind(
      adjusted_OR = coef(model), confint.default(model)
    )), summary(model)$coef[, "Pr(>|z|)"])
  multi_table <-
    multi_table[!grepl("Intercept", row.names(multi_table), ignore.case = T), ]
  multi_table <- round(multi_table, digits = 3)
  multi_table <- as.data.frame(multi_table)
  multi_table$V4[multi_table$V4 < 0.001] <- "<0.001"
  multi_table$adjusted_OR <-
    paste(
      multi_table$adjusted_OR,
      "(",
      multi_table$`2.5 %`,
      "-",
      multi_table$`97.5 %`,
      ")",
      sep = ""
    )
  multi_table$`2.5 %` <- NULL
  multi_table$`97.5 %` <- NULL
  names(multi_table)[names(multi_table) == "V4"] <- "p value"
  return(multi_table)
}



#' @title AutoScore function: Print receiver operating characteristic (ROC) performance
#' @description Print receiver operating characteristic (ROC) performance
#' @param label outcome variable
#' @param score predicted score
#' @param threshold Threshold for analyzing sensitivity, specificity and other metrics. Default to "best"
#' @param metrics_ci Whether to calculate confidence intervals for metrics such as sensitivity and specificity (Default: FALSE).
#' @seealso \code{\link{AutoScore_testing}}
#' @return No return value and the ROC performance will be printed out directly.
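#' @examples
#' # A minimal sketch (assumes `pred_score` is the data frame returned by
#' # \code{AutoScore_testing}):
#' \dontrun{
#' print_roc_performance(pred_score$Label, pred_score$pred_score, threshold = "best")
#' }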
#' @export
#' @import pROC
print_roc_performance <-
  function(label, score, threshold = "best", metrics_ci = FALSE) {
    if (sum(is.na(score)) > 0)
      warning("NA in the score: ", sum(is.na(score)))
    model_roc <- roc(label, score, quiet = T)
    cat("AUC: ", round(auc(model_roc), 4), "  ")
    print(ci(model_roc))


    if (threshold == "best") {
      threshold <-
        ceiling(coords(model_roc, "best", ret = "threshold", transpose = TRUE))
      cat("Best score threshold: >=", threshold, "\n")
    } else {
      cat("Score threshold: >=", threshold, "\n")
    }
    cat("Other performance indicators based on this score threshold: \n")


    if(metrics_ci){
    roc <-
      ci.coords(
        model_roc,
        threshold ,
        ret = c("specificity", "sensitivity", "npv", "ppv"),
        transpose = TRUE
      )
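    # each metric from ci.coords() comes as c(lower 2.5%, median, upper 97.5%);
    # the median is reported as the estimate below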
    cat(
      "Sensitivity: ",
      round(roc$sensitivity[2], 4),
      " 95% CI: ",
      round(roc$sensitivity[1], 4),
      "-",
      round(roc$sensitivity[3], 4),
      "\n",
      sep = ""
    )
    cat(
      "Specificity: ",
      round(roc$specificity[2], 4),
      " 95% CI: ",
      round(roc$specificity[1], 4),
      "-",
      round(roc$specificity[3], 4),
      "\n",
      sep = ""
    )
    cat(
      "PPV:         ",
      round(roc$ppv[2], 4),
      " 95% CI: ",
      round(roc$ppv[1], 4),
      "-",
      round(roc$ppv[3], 4),
      "\n",
      sep = ""
    )
    cat(
      "NPV:         ",
      round(roc$npv[2], 4),
      " 95% CI: ",
      round(roc$npv[1], 4),
      "-",
      round(roc$npv[3], 4),
      "\n",
      sep = ""
    )}


    else{
      roc <-
      coords(
        model_roc,
        threshold ,
        ret = c("specificity", "sensitivity", "npv", "ppv"),
        transpose = TRUE
      )
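    # coords() returns the values in the order given in `ret`:
    # [1] specificity, [2] sensitivity, [3] npv, [4] ppv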
    cat(
      "Sensitivity: ",
      round(roc[2], 4),
      "\n",
      sep = ""
    )
    cat(
      "Specificity: ",
      round(roc[1], 4),
      "\n",
      sep = ""
    )
    cat(
      "PPV:         ",
      round(roc[4], 4),
      "\n",
      sep = ""
    )
    cat(
      "NPV:         ",
      round(roc[3], 4),
      "\n",
      sep = ""
    )}
  }

#' @title AutoScore function: Print conversion table based on final performance evaluation
#' @description Print conversion table based on final performance evaluation
#' @param pred_score A data frame with outcomes (\code{Label}) and final scores (\code{pred_score}) generated from \code{\link{AutoScore_testing}}
#' @param by Specify the method for categorizing the threshold: by "risk" or "score". Default to "risk"
#' @param values A vector of thresholds for analyzing sensitivity, specificity and other metrics. Default to c(0.01, 0.05, 0.1, 0.2, 0.5)
#' @seealso \code{\link{AutoScore_testing}}
#' @return No return value; the conversion table will be printed out directly.
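#' @examples
#' # A minimal sketch (assumes `pred_score` is the data frame returned by
#' # \code{AutoScore_testing}):
#' \dontrun{
#' conversion_table(pred_score, by = "risk", values = c(0.01, 0.05, 0.1, 0.2, 0.5))
#' }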
#' @export
#' @import pROC knitr
conversion_table<-function(pred_score, by = "risk", values = c(0.01,0.05,0.1,0.2,0.5)){

  glmmodel<-glm(Label~pred_score,data = pred_score,family = binomial(link="logit"))
  pred_score$pred_risk<-predict(glmmodel,newdata=pred_score, type = "response")
  rtotoal<-data.frame(matrix(nrow=0,ncol=7))
  Modelroc<-roc(pred_score$Label,pred_score$pred_risk)

  if(by=="risk"){
    for(i in values){
      r<-data.frame(i)
      r$i<-paste(r$i*100,"%",sep="")
      names(r)[1]<-"Predicted Risk"
      r$`Score cut-off`<-paste("",min(pred_score[pred_score$pred_risk>=i,]$pred_score),sep="")
      r$`Percentage of patients (%)`<-round(length(pred_score[pred_score$pred_risk>=i,][,1])/length(pred_score[,1]),digits = 2)*100
      r1<-organize_performance(ci.coords(Modelroc,x=i ,input="threshold", ret=c("accuracy", "sensitivity","specificity" ,
                                                                                "ppv", "npv" )))
      #r1$X1<-NULL
      r<-cbind(r,r1)
      rtotoal<-rbind(rtotoal,r)
    }

    names(rtotoal)<-c("Predicted Risk [>=]", "Score cut-off [>=]", "Percentage of patients (%)","Accuracy (95% CI)",
                      "Sensitivity (95% CI)", "Specificity (95% CI)", "PPV (95% CI)",
                      "NPV (95% CI)")

  }else if(by=="score"){
    for(i in values){
      r<-data.frame(i)
      risk<-min(pred_score[pred_score$pred_score>=i,]$pred_risk)
      #risk<-round(risk,3)
      r$risk<-paste(round(risk,3)*100,"%",sep="")
      r$`Percentage of patients (%)`<-round(length(pred_score[pred_score$pred_risk>=risk,][,1])/length(pred_score[,1]),digits = 2)*100
      r1<-organize_performance(ci.coords(Modelroc,x=risk ,input="threshold", ret=c("accuracy", "sensitivity","specificity" ,
                                                                                   "ppv", "npv" )))
      #r1$X1<-NULL
      r<-cbind(r,r1)
      rtotoal<-rbind(rtotoal,r)
    }

    names(rtotoal)<-c("Score cut-off [>=]","Predicted Risk [>=]",  "Percentage of patients (%)","Accuracy (95% CI)",
                      "Sensitivity (95% CI)", "Specificity (95% CI)", "PPV (95% CI)",
                      "NPV (95% CI)")



  } else{ stop('ERROR: please specify correct method for categorizing the threshold:  by "risk" or "score".')}

  kable(rtotoal,align=c(rep('c',times=7)))

}


# Internal_function -------------------------------------------------------
## built-in function for AutoScore below
## Those functions are cited by pipeline functions

#' @title Internal function: Compute scoring table based on training dataset (AutoScore Module 3)
#' @description Compute scoring table based on training dataset
#' @param train_set_2 Processed training set after variable transformation (AutoScore Module 2)
#' @param max_score Maximum total score
#' @param variable_list List of included variables
#' @return A scoring table
compute_score_table <-
  function(train_set_2, max_score, variable_list) {
    #AutoScore Module 3 : Score weighting
    # First-step logistic regression
    model <-
      glm(label ~ ., family = binomial(link = "logit"), data = train_set_2)
    coef_vec <- coef(model)
    if (length(which(is.na(coef_vec))) > 0) {
      warning(" WARNING: GLM output contains NULL, Replace NULL with 1")
      coef_vec[which(is.na(coef_vec))] <- 1
    }
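    # update the reference category of each variable based on the first-step
    # coefficients, then refit the logistic regression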
    train_set_2 <- change_reference(train_set_2, coef_vec)

    # Second-step logistic regression
    model <-
      glm(label ~ ., family = binomial(link = "logit"), data = train_set_2)
    coef_vec <- coef(model)
    if (length(which(is.na(coef_vec))) > 0) {
      warning(" WARNING: GLM output contains NULL, Replace NULL with 1")
      coef_vec[which(is.na(coef_vec))] <- 1
    }

    # rounding for final scoring table "score_table"
    coef_vec_tmp <- round(coef_vec / min(coef_vec[-1]))
    score_table <- add_baseline(train_set_2, coef_vec_tmp)

    # normalization according to "max_score" and regenerate score_table
    total_max <- max_score
    total <- 0
    for (i in seq_along(variable_list)) {
      total <- total + max(score_table[grepl(variable_list[i], names(score_table))])
    }
    score_table <- round(score_table / (total / total_max))
    return(score_table)
  }


#' @title Internal function: Compute AUC based on validation set for plotting parsimony (AutoScore Module 4)
#' @description  Compute AUC based on validation set for plotting parsimony
#' @param train_set_1 Processed training set
#' @param validation_set_1 Processed validation set
#' @param max_score Maximum total score
#' @param variable_list List of included variables
#' @param categorize  Methods for categorizing continuous variables. Options include "quantile" or "kmeans"
#' @param quantiles Predefined quantiles to convert continuous variables to categorical ones. Available if \code{categorize = "quantile"}.
#' @param max_cluster The maximum number of clusters (Default: 5). Available if \code{categorize = "kmeans"}.
#' @return A \code{pROC} ROC object based on the validation set, from which the AUC for the parsimony plot is computed
compute_auc_val <-
  function(train_set_1,
           validation_set_1,
           variable_list,
           categorize,
           quantiles,
           max_cluster,
           max_score) {
    # AutoScore Module 2 : cut numeric and transfer categories
    cut_vec <-
      get_cut_vec(
        train_set_1,
        categorize = categorize,
        quantiles = quantiles,
        max_cluster = max_cluster
      )
    train_set_2 <- transform_df_fixed(train_set_1, cut_vec)
    validation_set_2 <-
      transform_df_fixed(validation_set_1, cut_vec)
    if (sum(is.na(validation_set_2)) > 0)
      warning("NA in the validation_set_2: ", sum(is.na(validation_set_2)))
    if (sum(is.na(train_set_2)) > 0)
      warning("NA in the train_set_2: ", sum(is.na(train_set_2)))

    # AutoScore Module 3 : Variable Weighting
    score_table <-
      compute_score_table(train_set_2, max_score, variable_list)
    if (sum(is.na(score_table)) > 0)
      warning("NA in the score_table: ", sum(is.na(score_table)))

    # Using "assign_score" to generate score based on new dataset and Scoring table "score_table"
    validation_set_3 <- assign_score(validation_set_2, score_table)
    if (sum(is.na(validation_set_3)) > 0)
      warning("NA in the validation_set_3: ", sum(is.na(validation_set_3)))

    validation_set_3$total_score <-
      rowSums(subset(validation_set_3, select = names(validation_set_3)[names(validation_set_3) !=
                                                                          "label"]))
    y_validation <- validation_set_3$label
    # plot_roc_curve(validation_set_3$total_score,as.numeric(y_validation)-1)

    # calculate AUC value
    model_roc <-
      roc(y_validation, validation_set_3$total_score, quiet = T)

    return(model_roc)
  }



#' @title Internal Function: Plotting ROC curve
#' @param prob Predicted probability
#' @param labels Actual outcome (binary)
#' @param quiet If set to TRUE, there will be no trace printing
#' @return No return value and the ROC curve will be plotted.
#' @import pROC
plot_roc_curve <- function(prob, labels, quiet = TRUE) {
  #library(pROC)
  # prob<-predict(model.glm,newdata=X_test, type = 'response')
  model_roc <- roc(labels, prob, quiet = quiet)
  auc <- auc(model_roc)
  auc_ci <- ci.auc(model_roc)

  roc.data <- data.frame(
    fpr = as.vector(coords(model_roc,
                           "local maximas", ret = "1-specificity", transpose = TRUE)),
    tpr = as.vector(coords(model_roc, "local maximas", ret = "sensitivity",
                           transpose = TRUE)))

  auc_ci <- sort(as.numeric(auc_ci)) # should include AUC and 95% CI
  clr <- rgb(red = 41, green = 70, blue = 76, maxColorValue = 255)
  clr_axis <- rgb(red = 25, green = 24, blue = 24, maxColorValue = 255)
  p<-ggplot(data.frame(fpr = roc.data$fpr, tpr = roc.data$tpr),
            aes_string(x = "fpr", ymin = 0, ymax = "tpr")) +
    #geom_ribbon(alpha = 0.2, fill = clr) +
    geom_line(aes_string(y = "tpr"), color = clr, lwd = 1.2) +
    geom_abline(slope = 1, intercept = 0, lty = 2, lwd = 0.3, color = clr_axis) +
    # scale_color_manual(values = ) +
    scale_x_continuous(expand = c(0, 0), limits = c(0, 1)) +
    scale_y_continuous(expand = c(0, 0), limits = c(0, 1)) +
    labs(x = "1-Specificity", y = "Sensitivity",
         title = "Receiver Operating Characteristic Curve",
         subtitle = sprintf("AUC=%.3f, 95%% CI: %.3f-%.3f",
                            auc_ci[2], auc_ci[1], auc_ci[3])) +
    theme_classic() +
    theme(plot.margin = unit(c(0.5,0.5,0.5,0.5),"cm"),
          axis.text= element_text(size=12),
          text = element_text(size = 12, color = clr_axis),
          #text = element_text(family = "Tahoma", size = 12, color = clr_axis),
          # axis.line = element_line(color = clr_axis, size = 0.3),
          axis.line = element_blank(),
          axis.ticks.length = unit(0.15, units = "cm"),
          panel.border = element_rect(color = clr_axis, fill = NA))
  print(p)
}


organize_performance<-function(w1){
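  # Internal helper: format the output of pROC::ci.coords() into a one-row data
  # frame of "estimate% (lower-upper%)" strings used by conversion_table().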
  df <- data.frame(w1)
  df <- round(df, digits = 3)*100
  df1 <- data.frame(1)
  df1 <- data.frame(`Accuracy (95% CI)`=paste(df$accuracy.50.,"% (",df$accuracy.2.5.,"-",df$accuracy.97.5.,"%)",sep = ""),
                    `Sensitivity (95% CI)`=paste(df$sensitivity.50.,"% (",df$sensitivity.2.5.,"-",df$sensitivity.97.5.,"%)",sep = ""),
                    `Specificity (95% CI)`=paste(df$specificity.50.,"% (",df$specificity.2.5.,"-",df$specificity.97.5.,"%)",sep = ""),
                    `PPV (95% CI)`=paste(df$ppv.50.,"% (",df$ppv.2.5.,"-",df$ppv.97.5.,"%)",sep = ""),
                    `NPV (95% CI)`=paste(df$npv.50.,"% (",df$npv.2.5.,"-",df$npv.97.5.,"%)",sep = ""),
                    check.names = FALSE)

  #print(df1)
  return(df1)
}




#' 20000 simulated ICU admission data, with the same distribution as the data in the MIMIC-III ICU database
#'
#' @description 20000 simulated samples, with the same distribution as the data in the MIMIC-III ICU database. It is used for demonstration only in the Guidebook. Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#'  \itemize{
#'  \item{Johnson, A., Pollard, T., Shen, L. et al. MIMIC-III, a freely accessible critical care database. Sci Data 3, 160035 (2016).}
#' }
"sample_data"


#' 1000 simulated ICU admission data, with the same distribution as the data in the MIMIC-III ICU database
#'
#' @description 1000 simulated samples, with the same distribution as the data in the MIMIC-III ICU database. It is used for demonstration only in the Guidebook. Run \code{vignette("Guide_book", package = "AutoScore")} to see the guidebook or vignette.
#'  \itemize{
#'  \item{Johnson, A., Pollard, T., Shen, L. et al. MIMIC-III, a freely accessible critical care database. Sci Data 3, 160035 (2016).}
#' }
"sample_data_small"


#' 20000 simulated ICU admission data with missing values
#'
#' @description 20000 simulated samples with missing values, which can be used for demonstrating the AutoScore workflow when dealing with missing values.
#'  \itemize{
#'  \item{Johnson, A., Pollard, T., Shen, L. et al. MIMIC-III, a freely accessible critical care database. Sci Data 3, 160035 (2016).}
#' }
"sample_data_with_missing"
