R/validation.R
In csmpv: Biomarker Confirmation, Selection, Modelling, Prediction, and Validation

Documented in validation

#' Validate Model Predictions
#' 
#' @description This function is designed to perform model validation when the corresponding outcome variable is available. It facilitates the comparison of model predictions
#' with the provided outcome variable, which can be continuous, binary, or time-to-event.
#' 
#' @details This function is invoked by multiple prediction functions within this package when an outcome variable is available for a new dataset. 
#' However, users can also directly call this function if needed. 
#' 
#' @param predicted A vector of model prediction values, which can be generated by prediction functions in this package such as LASSO2_predict and XGBtraining_predict. 
#' For continuous outcomes, this vector represents model scores; for binary outcomes, it represents the probability of the positive class; 
#' for time-to-event outcomes, it contains risk scores, which will later be transformed into estimated survival probabilities corresponding to times in the new data.
#' 
#' @param outcomeType Outcome variable type. There are three choices: "binary" (default), "continuous", and "time-to-event".
#' @param trueY A vector of the outcome variable when it is continuous or binary.
#' @param time A vector of time for time-to-event outcome.
#' @param trueEvent A vector of the event for time-to-event outcome.
#' @param baseHz A table for accumulating baseline hazard for multiple time points, usually generated based on a training data set. 
#' @param u A single numeric follow-up time for survival outcomes.
#' @param outfile A string for the output file, including the path if necessary but without the file type extension. 

#' @import rms
#' @import survival
#' @import Hmisc
#' @importFrom utils write.csv
#' @importFrom grDevices dev.off
#' @importFrom grDevices pdf
#' 
#' @return A vector of model prediction values from the input
#' @references 
#'   Harrell Jr F (2023). rms: Regression Modeling Strategies_. R package version 6.7-1, <https://CRAN.R-project.org/package=rms>
#'   
#'   Harrell Jr F (2023). Hmisc: Harrell Miscellaneous_. R package version 5.1-1, <https://CRAN.R-project.org/package=Hmisc>
#'   
#' @examples
#' # Load in data sets:
#' data("datlist", package = "csmpv")
#' tdat = datlist$training
#' vdat = datlist$validation
#' 
#' # The function saves files locally. You can define your own temporary directory. 
#' # If not, tempdir() can be used to get the system's temporary directory.
#' temp_dir = tempdir()
#' # As an example, let's define Xvars, which will be used later:
#' Xvars = c("highIPI", "B.Symptoms", "MYC.IHC", "BCL2.IHC", "CD10.IHC", "BCL6.IHC")
#'
#' # The function can work with multiple models and multiple outcome types. 
#' # Here, we use XGBoost model with binary outcome as an example:
#' bxfit = XGBtraining(data = tdat, biomks = Xvars, Y = "DZsig",
#'                     outfile = paste0(temp_dir, "/binary_XGBoost"))
#' testdat = vdat[,bxfit$XGBoost_model$feature_names]
#' test = xgboost::xgb.DMatrix(data.matrix(testdat))
#' scores = stats::predict(bxfit$XGBoost_model, test) 
#' names(scores) = rownames(vdat)
#' Y = bxfit$Y
#' outs = validation(predicted = scores, outcomeType = "binary", trueY = vdat[,Y],
#'                   outfile = paste0(temp_dir, "/binary_XGBoost_validate")) 
#' # You might save the files to the directory you want.
#' 
#' # To delete the "temp_dir", use the following:
#' unlink(temp_dir)

#' @export

validation = function(predicted =NULL, outcomeType = c("binary","continuous","time-to-event"), trueY = NULL, time = NULL, trueEvent = NULL, 
                      baseHz = NULL,u = 2, outfile = "nameWithPath"){
  if(outcomeType == "continuous"){
    newplot = validate_continuous(yhat = predicted, yobs = trueY)
    pdf(paste0(outfile, "_newData_validation.pdf"))
    print(newplot)
    dev.off()
    print(newplot)

  }else if(outcomeType == "binary"){
    minV = min(c(predicted, trueY))
    maxV = max(c(predicted, trueY))
    # newplot = CalibrationCurves::val.prob.ci.2(predicted, trueY, CL.smooth = TRUE, logistic.cal = TRUE, lty.log = 9,
    #                         col.log = "red", lwd.log = 1.5, col.ideal = colors()[10], lwd.ideal = 0.5,
    #                         xlim = c(minV, maxV), ylim = c(minV, maxV))
    pdf(paste0(outfile, "_newData_validation.pdf"))
    rms::val.prob(predicted, trueY)
    dev.off()
  
    rms::val.prob(predicted, trueY)
    
  }else if(outcomeType == "time-to-event"){
    ## for this part, I do need baseHz to get survival probability
    
    ## notice that predicted here is risk score, or exp(linear model score)
    ## in order to get surv: exp(-bestMatchedTimeCumHaz  * exp(linear model score))
    ## the trick is how to get matched CumHaz
    
    # Calculate absolute differences
    diffs = abs(baseHz[, "time"] - u)
    
    # Find rows where X - u > 0
    filtered_rows = which(baseHz[, "time"] - u > 0)
    
    # Find the index with minimum difference
    min_diff_index = filtered_rows[which.min(diffs[filtered_rows])]
    
    cumhaz = baseHz[min_diff_index,"hazard"]
    
    ## in order to get surv: exp(-bestMatchedTimeCumHaz  * exp(linear model score)), should do more
    ss = exp(-cumhaz * predicted) ## predicted is risk score, ie: exp(linear model score)
    survs = survival::Surv(time = time, event = trueEvent)
    cindex = Hmisc::rcorr.cens(x=ss, S = survs) 
    write.csv(cindex, paste0(outfile, "_newData_Cindex.csv"))
    newsurv = rms::val.surv(S = survs, est.surv = ss)  
    pdf(paste0(outfile, "_newData_validation.pdf"))
    plot(newsurv, xlab = "Predicted Probability", 
         ylab = "Actual Probability",
         main = paste0("C-index: ", format(cindex[1], digits = 4)))
    dev.off()
    plot(newsurv, xlab = "Predicted Probability", 
              ylab = "Actual Probability",
              main = paste0("C-index: ", format(cindex[1], digits = 4)))

  }

  return(predicted) 
  
}