R/SklearnClassifier.R
In PatientLevelPrediction: Develop Clinical Prediction Models Using the Common Data Model

Documented in computeGridPerformance

# @file SklearnClassifier.R
#
# Copyright 2025 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

fitSklearn <- function(trainData,
                       modelSettings,
                       search = "grid",
                       analysisId,
                       ...) {
  param <- modelSettings$param

  # check covariate data
  if (!FeatureExtraction::isCovariateData(trainData$covariateData)) {
    stop("Needs correct covariateData")
  }

  # get the settings from the param
  pySettings <- attr(param, "settings")

  # make sure the inputs are valid
  checkPySettings(pySettings)

  start <- Sys.time()

  if (!is.null(trainData$folds)) {
    trainData$labels <-
      merge(trainData$labels, trainData$fold, by = "rowId")
  }

  # convert the data to a sparse R Matrix and then use reticulate to convert to python sparse
  # need to save the covariateMap so we know the covariateId to columnId when applying model
  mappedData <- toSparseM(trainData)

  matrixData <- mappedData$dataMatrix
  labels <- mappedData$labels
  covariateRef <- mappedData$covariateRef

  # save the model to outLoc
  outLoc <- createTempModelLoc()

  cvResult <- do.call(
    what = gridCvPython,
    args = list(
      matrixData = matrixData,
      labels = labels,
      seed = pySettings$seed,
      requiresDenseMatrix = pySettings$requiresDenseMatrix,
      modelName = pySettings$name,
      pythonModule = pySettings$pythonModule,
      pythonClass = pySettings$pythonClass,
      modelLocation = outLoc,
      paramSearch = param,
      saveToJson = attr(param, "saveToJson")
    )
  )

  hyperSummary <-
    do.call(
      rbind,
      lapply(cvResult$paramGridSearch, function(x) {
        x$hyperSummary
      })
    )

  prediction <- cvResult$prediction

  variableImportance <- cvResult$variableImportance
  variableImportance[is.na(variableImportance)] <- 0

  incs <- rep(1, nrow(covariateRef))
  covariateRef$included <- incs
  covariateRef$covariateValue <-
    unlist(variableImportance) # check this is correct order

  comp <- start - Sys.time()

  result <- list(
    model = file.path(outLoc),
    preprocessing = list(
      featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering,
      tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings,
      requireDenseMatrix = attr(param, "settings")$requiresDenseMatrix
    ),
    prediction = prediction,
    modelDesign = PatientLevelPrediction::createModelDesign(
      targetId = attr(trainData, "metaData")$targetId,
      outcomeId = attr(trainData, "metaData")$outcomeId,
      restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings,
      covariateSettings = attr(trainData, "metaData")$covariateSettings,
      populationSettings = attr(trainData, "metaData")$populationSettings,
      featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings,
      preprocessSettings = attr(trainData$covariateData, "metaData")$preprocessSettings,
      modelSettings = modelSettings,
      splitSettings = attr(trainData, "metaData")$splitSettings,
      sampleSettings = attr(trainData, "metaData")$sampleSettings
    ),
    trainDetails = list(
      analysisId = analysisId,
      analysisSource = "",
      # TODO add from model
      developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName,
      developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema,
      attrition = attr(trainData, "metaData")$attrition,
      trainingTime = paste(as.character(abs(comp)), attr(comp, "units")),
      trainingDate = Sys.Date(),
      modelName = pySettings$name,
      finalModelParameters = cvResult$finalParam,
      hyperParamSearch = hyperSummary
    ),
    covariateImportance = covariateRef
  )

  class(result) <- "plpModel"
  attr(result, "predictionFunction") <- "predictPythonSklearn"
  attr(result, "modelType") <- "binary"
  attr(result, "saveType") <-
    attr(param, "saveType") # in save/load plp
  attr(result, "saveToJson") <-
    attr(param, "saveToJson") # when saving in reticulate

  return(result)
}


predictPythonSklearn <- function(plpModel,
                                 data,
                                 cohort) {
  if (inherits(data, "plpData")) {
    # convert
    matrixObjects <- toSparseM(
      plpData = data,
      cohort = cohort,
      map = plpModel$covariateImportance %>%
        dplyr::select("columnId", "covariateId")
    )

    newData <- matrixObjects$dataMatrix
    cohort <- matrixObjects$labels
  } else {
    newData <- data
  }

  # load model
  if (attr(plpModel, "saveToJson")) {
    modelLocation <-
      reticulate::r_to_py(file.path(plpModel$model, "model.json"))
    model <- sklearnFromJson(path = modelLocation)
  } else {
    os <- reticulate::import("os")
    joblib <- reticulate::import("joblib", convert = FALSE)
    modelLocation <-
      reticulate::r_to_py(file.path(plpModel$model, "model.pkl"))
    model <- joblib$load(os$path$join(modelLocation))
  }
  pythonData <- reticulate::r_to_py(newData)

  # make dense if needed
  if (plpModel$preprocessing$requireDenseMatrix) {
    pythonData <- pythonData$toarray()
  }

  cohort <- predictValues(
    model = model,
    data = pythonData,
    cohort = cohort,
    type = attr(plpModel, "modelType")
  )

  return(cohort)
}

predictValues <- function(model, data, cohort, type = "binary") {
  predictionValue <- model$predict_proba(data)
  cohort$value <- reticulate::py_to_r(predictionValue)[, 2]

  cohort <- cohort %>%
    dplyr::select(-"rowId") %>%
    dplyr::rename(rowId = "originalRowId")

  attr(cohort, "metaData")$modelType <- type

  return(cohort)
}

checkPySettings <- function(settings) {
  checkIsClass(settings$seed, c("numeric", "integer"))
  ParallelLogger::logDebug(paste0("classifier seed: ", settings$seed))

  checkIsClass(settings$requiresDenseMatrix, c("logical"))
  ParallelLogger::logDebug(paste0("requiresDenseMatrix: ", settings$requiresDenseMatrix))

  checkIsClass(settings$name, c("character"))
  ParallelLogger::logDebug(paste0("name: ", settings$name))

  checkIsClass(settings$pythonModule, c("character"))
  ParallelLogger::logDebug(paste0("pythonModule: ", settings$pythonModule))

  checkIsClass(settings$pythonClass, c("character"))
  ParallelLogger::logDebug(paste0("pythonClass: ", settings$pythonClass))
}

gridCvPython <- function(matrixData,
                         labels,
                         seed,
                         requiresDenseMatrix,
                         modelName,
                         pythonModule,
                         pythonClass,
                         modelLocation,
                         paramSearch,
                         saveToJson) {
  ParallelLogger::logInfo(paste0("Running CV for ", modelName, " model"))

  np <- reticulate::import("numpy")
  joblib <- reticulate::import("joblib")

  module <- reticulate::import(pythonModule, convert = FALSE)
  classifier <- module[pythonClass]

  ###########################################################################

  gridSearchPredictons <- list()
  length(gridSearchPredictons) <- length(paramSearch)

  for (gridId in 1:length(paramSearch)) {
    # initiate prediction
    prediction <- c()

    fold <- labels$index
    ParallelLogger::logInfo(paste0("Max fold: ", max(fold)))

    for (i in 1:max(fold)) {
      ParallelLogger::logInfo(paste0("Fold ", i))
      trainY <- reticulate::r_to_py(labels$outcomeCount[fold != i])
      trainX <- reticulate::r_to_py(matrixData[fold != i, ])
      testX <- reticulate::r_to_py(matrixData[fold == i, ])

      if (requiresDenseMatrix) {
        ParallelLogger::logInfo("Converting sparse martix to dense matrix (CV)")
        trainX <- trainX$toarray()
        testX <- testX$toarray()
      }

      model <-
        fitPythonModel(
          classifier,
          paramSearch[[gridId]],
          seed,
          trainX,
          trainY,
          np,
          pythonClass
        )

      ParallelLogger::logInfo("Calculating predictions on left out fold set...")
      prediction <-
        rbind(
          prediction,
          predictValues(
            model = model,
            data = testX,
            cohort = labels[fold == i, ],
            type = "binary"
          )
        )
    }

    gridSearchPredictons[[gridId]] <- list(
      prediction = prediction,
      param = paramSearch[[gridId]]
    )
  }

  # get best para (this could be modified to enable any metric instead of AUC, just need metric input in function)

  paramGridSearch <-
    lapply(gridSearchPredictons, function(x) {
      do.call(computeGridPerformance, x)
    }) # cvAUCmean, cvAUC, param

  optimalParamInd <-
    which.max(unlist(lapply(paramGridSearch, function(x) {
      x$cvPerformance
    })))

  finalParam <- paramGridSearch[[optimalParamInd]]$param

  cvPrediction <- gridSearchPredictons[[optimalParamInd]]$prediction
  cvPrediction$evaluationType <- "CV"

  ParallelLogger::logInfo("Training final model using optimal parameters")

  trainY <- reticulate::r_to_py(labels$outcomeCount)
  trainX <- reticulate::r_to_py(matrixData)

  if (requiresDenseMatrix) {
    ParallelLogger::logInfo("Converting sparse martix to dense matrix (final model)")
    trainX <- trainX$toarray()
  }

  model <-
    fitPythonModel(
      classifier,
      finalParam,
      seed,
      trainX,
      trainY,
      np,
      pythonClass
    )

  ParallelLogger::logInfo("Calculating predictions on all train data...")
  prediction <-
    predictValues(
      model = model,
      data = trainX,
      cohort = labels,
      type = "binary"
    )
  prediction$evaluationType <- "Train"

  prediction <- rbind(
    prediction,
    cvPrediction
  )

  # saving model
  if (!dir.exists(file.path(modelLocation))) {
    dir.create(file.path(modelLocation), recursive = TRUE)
  }
  if (saveToJson) {
    sklearnToJson(
      model = model,
      path = file.path(modelLocation, "model.json")
    )
  } else {
    joblib$dump(model, file.path(modelLocation, "model.pkl"), compress = TRUE)
  }

  # feature importance
  variableImportance <-
    tryCatch(
      {
        reticulate::py_to_r(model$feature_importances_)
      },
      error = function(e) {
        ParallelLogger::logInfo(e)
        return(rep(1, ncol(matrixData)))
      }
    )

  return(
    list(
      prediction = prediction,
      finalParam = finalParam,
      paramGridSearch = paramGridSearch,
      variableImportance = variableImportance
    )
  )
}


fitPythonModel <-
  function(classifier,
           param,
           seed,
           trainX,
           trainY,
           np,
           pythonClass) {
    ParallelLogger::logInfo(paste0("data X dim: ", trainX$shape[0], "x", trainX$shape[1]))
    ParallelLogger::logInfo(paste0(
      "data Y length: ",
      np$shape(trainY)[[1]],
      " with ",
      np$sum(trainY),
      " outcomes"
    ))

    timeStart <- Sys.time()

    # print parameters
    # convert NULL to NA values
    paramString <- param
    for (ind in 1:length(paramString)) {
      if (is.null(paramString[[ind]])) {
        paramString[[ind]] <- "null"
      }
    }
    ParallelLogger::logInfo(paste(
      names(param),
      unlist(paramString),
      sep = ":",
      collapse = "    "
    ))

    if (!is.null(param)) {
      model <-
        do.call(
          paste0(pythonClass, "Inputs"),
          list(classifier = classifier, param = param)
        )
    } else {
      model <- classifier()
    }
    model <- model$fit(trainX, trainY)
    timeEnd <- Sys.time()

    ParallelLogger::logInfo(paste0(
      "Training model took (mins): ",
      difftime(timeEnd, timeStart, units = "mins")
    ))

    return(model)
  }



#' Computes grid performance with a specified performance function
#'
#' @param prediction a dataframe with predictions and outcomeCount per rowId
#' @param param a list of hyperparameters
#' @param performanceFunct a string specifying which performance function to use
#' . Default ``'compute_AUC'``
#' @return A list with overview of the performance
#' @examples
#' prediction <- data.frame(rowId = c(1, 2, 3, 4, 5),
#'                          outcomeCount = c(0, 1, 0, 1, 0),
#'                          value = c(0.1, 0.9, 0.2, 0.8, 0.3),
#'                          index = c(1, 1, 1, 1, 1))
#' param <- list(hyperParam1 = 5, hyperParam2 = 100)
#' computeGridPerformance(prediction, param, performanceFunct = "computeAuc")
#' @export
computeGridPerformance <-
  function(prediction, param, performanceFunct = "computeAuc") {
    performance <- do.call(
      what = eval(parse(text = performanceFunct)),
      args = list(prediction = prediction)
    )

    performanceFold <- c()
    for (i in 1:max(prediction$index)) {
      performanceFold <- c(
        performanceFold,
        do.call(
          what = eval(parse(text = performanceFunct)),
          args = list(prediction = prediction[prediction$index == i, ])
        )
      )
    }

    paramString <- param
    for (ind in 1:length(paramString)) {
      if (is.null(paramString[[ind]])) {
        paramString[[ind]] <- "null"
      }
    }

    paramValues <- unlist(paramString)
    names(paramValues) <- names(param)

    hyperSummary <- as.data.frame(c(
      data.frame(
        metric = performanceFunct,
        fold = c("CV", as.character(1:length(performanceFold))),
        value = c(performance, performanceFold)
      ),
      paramValues
    ))
    return(
      list(
        metric = performanceFunct,
        cvPerformance = performance,
        cvPerformancePerFold = performanceFold,
        param = param,
        hyperSummary = hyperSummary
      )
    )
  }

Any scripts or data that you put into this service are public.

PatientLevelPrediction documentation built on April 3, 2025, 9:58 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

PatientLevelPrediction
Develop Clinical Prediction Models Using the Common Data Model

R/SklearnClassifier.R
In PatientLevelPrediction: Develop Clinical Prediction Models Using the Common Data Model

Defines functions computeGridPerformance fitPythonModel gridCvPython checkPySettings predictValues predictPythonSklearn fitSklearn

Documented in computeGridPerformance

Try the PatientLevelPrediction package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PatientLevelPrediction Develop Clinical Prediction Models Using the Common Data Model

R/SklearnClassifier.R In PatientLevelPrediction: Develop Clinical Prediction Models Using the Common Data Model

Defines functions computeGridPerformance fitPythonModel gridCvPython checkPySettings predictValues predictPythonSklearn fitSklearn

Documented in computeGridPerformance

Try the PatientLevelPrediction package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PatientLevelPrediction
Develop Clinical Prediction Models Using the Common Data Model

R/SklearnClassifier.R
In PatientLevelPrediction: Develop Clinical Prediction Models Using the Common Data Model