R/RunPlp.R

Defines functions characterize covariateSummary summary.runPlp runPlp

Documented in runPlp

# @file RunPlp.R
#
# Copyright 2020 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' runPlp - Train and evaluate the model
#'
#' @description
#' This provides a general framework for training patient level prediction models.  The user can select 
#' various default feature selection methods or incorporate their own,  The user can also select from
#' a range of default classifiers or incorporate their own.  There are three types of evaluations for the model
#' patient (randomly splits people into train/validation sets) or year (randomly splits data into train/validation sets
#' based on index year - older in training, newer in validation) or both (same as year spliting but checks there are
#' no overlaps in patients within training set and validaiton set - any overlaps are removed from validation set)
#' 
#' @details
#' Users can define a risk period of interest for the prediction of the outcome relative to index or use
#' the cohprt dates.  The user can then specify whether they wish to exclude patients who are not observed
#' during the whole risk period, cohort period or experienced the outcome prior to the risk period.
#'
#' @param population                       The population created using createStudyPopulation() who will be used to develop the model
#' @param plpData                          An object of type \code{plpData} - the patient level prediction
#'                                         data extracted from the CDM.
#' @param minCovariateFraction             The minimum fraction of target population who must have a covariate for it to be included in the model training                            
#' @param normalizeData                    Whether to normalise the covariates before training (Default: TRUE)      
#' @param modelSettings                    An object of class \code{modelSettings} created using one of the function:
#'                                         \itemize{
#'                                         \item{setLassoLogisticRegression()}{ A lasso logistic regression model}
#'                                         \item{setGradientBoostingMachine()}{ A gradient boosting machine}
#'                                         \item{setAdaBoost()}{ An ada boost model}
#'                                         \item{setRandomForest()}{ A random forest model}
#'                                         \item{setDecisionTree()}{ A decision tree model}
#'                                         \item{setCovNN())}{ A convolutional neural network model}
#'                                         \item{setCIReNN()}{ A recurrent neural network model}
#'                                         \item{setMLP()}{ A neural network model}
#'                                         \item{setDeepNN()}{ A deep neural network model}
#'                                         \item{setKNN()}{ A KNN model}
#'                                         
#'                                         }
#' @param testSplit                        Either 'stratified', 'subject' or 'time' specifying the type of evaluation used.
#'                                         'time' find the date where testFraction of patients had an index after the date and assigns patients with an index prior to this date into the training set and post the date into the test set
#'                                         'stratified' splits the data into test (1-testFraction of the data) and
#'                                         train (validationFraction of the data) sets.  The split is stratified by the class label. 'subject' split is useful
#'                                         when a subject is in the data multiple times and you want all rows for the same subject in either the test or the train set but not in both.
#' @param testFraction                     The fraction of the data to be used as the test set in the patient
#'                                         split evaluation.
#' @param trainFraction                    A real number between 0 and 1 indicating the train set fraction of the data.
#'                                         If not set trainFraction is equal to 1 - test
#' @param splitSeed                        The seed used to split the test/train set when using a person type testSplit                  
#' @param nfold                            The number of folds used in the cross validation (default 3)
#' @param indexes                          A dataframe containing a rowId and index column where the index value of -1 means in the test set, and positive integer represents the cross validation fold (default is NULL)
#' @param saveDirectory                    The path to the directory where the results will be saved (if NULL uses working directory)
#' @param savePlpData                      Binary indicating whether to save the plpData object (default is T)
#' @param savePlpResult                    Binary indicating whether to save the object returned by runPlp (default is T)
#' @param savePlpPlots                     Binary indicating whether to save the performance plots as pdf files (default is T)
#' @param saveEvaluation                   Binary indicating whether to save the oerformance as csv files (default is T)
#' @param verbosity                        Sets the level of the verbosity. If the log level is at or higher in priority than the logger threshold, a message will print. The levels are:
#'                                         \itemize{
#'                                         \item{DEBUG}{Highest verbosity showing all debug statements}
#'                                         \item{TRACE}{Showing information about start and end of steps}
#'                                         \item{INFO}{Show informative information (Default)}
#'                                         \item{WARN}{Show warning messages}
#'                                         \item{ERROR}{Show error messages}
#'                                         \item{FATAL}{Be silent except for fatal errors}
#'                                         }
#' @param timeStamp                        If TRUE a timestamp will be added to each logging statement. Automatically switched on for TRACE level.
#' @param analysisId                       Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.
#' @param runCovariateSummary              Whether to calculate the mean and sd for each covariate
#' @param save                             Old input - please now use saveDirectory
#' @return
#' An object containing the model or location where the model is save, the data selection settings, the preprocessing
#' and training settings as well as various performance measures obtained by the model.
#'
#' \item{predict}{A function that can be applied to new data to apply the trained model and make predictions}
#' \item{model}{A list of class \code{plpModel} containing the model, training metrics and model metadata}
#' \item{prediction}{A dataframe containing the prediction for each person in the test set }
#' \item{evalType}{The type of evaluation that was performed ('person' or 'time')}
#' \item{performanceTest}{A list detailing the size of the test sets}
#' \item{performanceTrain}{A list detailing the size of the train sets}
#' \item{time}{The complete time taken to do the model framework}
#'
#'
#' @export
#' @examples
#' \dontrun{
#' #******** EXAMPLE 1 ********* 
#' #load plpData:
#' plpData <- loadPlpData(file.path('C:','User','home','data'))
#' 
#' #create study population to develop model on
#' #require minimum of 365 days observation prior to at risk start
#' #no prior outcome and person must be observed for 365 after index (minTimeAtRisk)
#' #with risk window from 0 to 365 days after index
#' population <- createStudyPopulation(plpData,outcomeId=2042,
#'                                     firstExposureOnly = FALSE,
#'                                     washoutPeriod = 365,
#'                                     removeSubjectsWithPriorOutcome = TRUE,
#'                                     priorOutcomeLookback = 99999,
#'                                     requireTimeAtRisk = TRUE,
#'                                     minTimeAtRisk=365,
#'                                     riskWindowStart = 0,
#'                                     addExposureDaysToStart = FALSE,
#'                                     riskWindowEnd = 365,
#'                                     addExposureDaysToEnd = FALSE)
#' 
#' #lasso logistic regression predicting outcome 200 in cohorts 10 
#' #using no feature selection with a time split evaluation with 30% in test set
#' #70% in train set where the model hyper-parameters are selected using 3-fold cross validation:
#' #and results are saved to file.path('C:','User','home')
#' model.lr <- lassoLogisticRegression.set()
#' mod.lr <- runPlp(population=population,
#'                         plpData= plpData, minCovariateFraction = 0.001,
#'                         modelSettings = model.lr ,
#'                         testSplit = 'time', testFraction=0.3, 
#'                         nfold=3, indexes=NULL,
#'                         saveDirectory =file.path('C:','User','myPredictionName'),
#'                         verbosity='INFO')
#'  
#' #******** EXAMPLE 2 *********                                               
#' # Gradient boosting machine with a grid search to select hyper parameters  
#' # using the test/train/folds created for the lasso logistic regression above                       
#' model.gbm <- gradientBoostingMachine.set(rsampRate=c(0.5,0.9,1),csampRate=1, 
#'                            ntrees=c(10,100), bal=c(F,T),
#'                            max_depth=c(4,5), learn_rate=c(0.1,0.01))
#' mod.gbm <- runPlp(population=population,
#'                         plpData= plpData,
#'                         modelSettings = model.gbm,
#'                         testSplit = 'time', testFraction=0.3, 
#'                         nfold=3, indexes=mod.lr$indexes,
#'                         saveDirectory =file.path('C:','User','myPredictionName2'))
#' } 
runPlp <- function(population, plpData,  minCovariateFraction = 0.001, normalizeData=T,
                   modelSettings,
                   testSplit = 'stratified', testFraction=0.25, trainFraction = NULL, splitSeed=NULL, nfold=3, indexes=NULL,
                   saveDirectory=NULL, savePlpData=T,
                   savePlpResult=T, savePlpPlots = T, saveEvaluation = T,
                   verbosity="INFO", timeStamp=FALSE, analysisId=NULL, 
                   runCovariateSummary = T,
                   save=NULL
){
  
  if(!missing(save)){
    warning('save has been replaced with saveDirectory - please use this input from now on')
    if(is.null(saveDirectory)){saveDirectory <- save}
  }
  
  if(missing(verbosity)){
    verbosity <- "INFO"
  } else{
    if(!verbosity%in%c("DEBUG","TRACE","INFO","WARN","FATAL","ERROR", "NONE")){
      stop('Incorrect verbosity string')
    }
  }
  
  # log the start time:
  ExecutionDateTime <- Sys.time()
  
  # create an analysisid and folder to save the results
  start.all <- Sys.time()
  if(is.null(analysisId))
    analysisId <- gsub(':','',gsub('-','',gsub(' ','',start.all)))
  
  if(is.null(saveDirectory)){
    analysisPath <- file.path(getwd(),analysisId)
  } else {
    analysisPath <- file.path(saveDirectory,analysisId) 
  }
  
  if(verbosity!="NONE"){
    if(!dir.exists(analysisPath)){dir.create(analysisPath,recursive=T)}
  }
  logFileName = paste0(analysisPath,'/plplog.txt')
  
  # write log to both console and file (tee). 
  # note other appenders can be created, e.g., to webservice or database!
  clearLoggerType("PLP Log")
  if(verbosity!="NONE"){
    logger <- ParallelLogger::createLogger(name = "PLP Log",
                                           threshold = verbosity,
                                           appenders = list(ParallelLogger::createFileAppender(layout = ParallelLogger::layoutParallel,
                                                                                               fileName = logFileName)))
    ParallelLogger::registerLogger(logger)
  }
  
  ParallelLogger::logInfo(paste0('Patient-Level Prediction Package version ', utils::packageVersion("PatientLevelPrediction")))
  
  # get ids
  cohortId <- attr(population, "metaData")$cohortId
  outcomeId <- attr(population, "metaData")$outcomeId
  
  # add header to analysis log
  ParallelLogger::logInfo(sprintf('%-20s%s', 'AnalysisID: ',analysisId))
  ParallelLogger::logInfo(sprintf('%-20s%s', 'CohortID: ', cohortId))
  ParallelLogger::logInfo(sprintf('%-20s%s', 'OutcomeID: ', outcomeId))
  ParallelLogger::logInfo(sprintf('%-20s%s', 'Cohort size: ', nrow(plpData$cohorts)))
  ParallelLogger::logInfo(sprintf('%-20s%s', 'Covariates: ', nrow(plpData$covariateData$covariateRef)))
  ParallelLogger::logInfo(sprintf('%-20s%s', 'Population size: ', nrow(population)))
  ParallelLogger::logInfo(sprintf('%-20s%s', 'Cases: ', sum(population$outcomeCount>0)))
  
  # check parameters
  ParallelLogger::logTrace('Parameter Check Started')
  ParallelLogger::logDebug(paste0('testSplit: ', testSplit))
  checkInStringVector(testSplit, c('person','time', 'stratified','subject'))
  ParallelLogger::logDebug(paste0('outcomeCount: ', sum(population[,'outcomeCount']>0)))
  checkHigherEqual(sum(population[,'outcomeCount']>0), 25)
  ParallelLogger::logDebug(paste0('plpData class: ', class(plpData)))
  checkIsClass(plpData, c('plpData'))
  ParallelLogger::logDebug(paste0('testfraction: ', testFraction))
  checkIsClass(testFraction, c('numeric','integer'))
  checkHigher(testFraction,0)
  checkHigher(-1*testFraction,-1)
  ParallelLogger::logDebug(paste0('nfold class: ', class(nfold)))
  ParallelLogger::logDebug(paste0('nfold: ', nfold))
  checkIsClass(nfold, c('numeric','integer'))
  checkHigher(nfold, 0)
  
  # if savePlpData
  if(savePlpData){
    ParallelLogger::logInfo(sprintf('%-20s%s', 'Saving plpData to ', file.path(analysisPath,'plpData')))
    savePlpData(plpData, file.path(analysisPath,'plpData'))
  }
  
  # construct the settings for the model pipeline
  if(is.null(indexes)){
    if(testSplit=='stratified'){
      ParallelLogger::logTrace('Dataset stratified split started')
      if(is.null(splitSeed)){ #keep record of splitSeed
        splitSeed <- sample(20000000,1)-10000000
        ParallelLogger::logInfo(paste0('splitSeed: ', splitSeed))
      } 
      indexes <-tryCatch(randomSplitter(population, test=testFraction, train = trainFraction, nfold=nfold, seed=splitSeed),
                         finally=ParallelLogger::logTrace('Done.'))
    }
    if(testSplit=='subject'){
      ParallelLogger::logTrace('Dataset subject split started')
      if(is.null(splitSeed)){ #keep record of splitSeed
        splitSeed <- sample(20000000,1)-10000000
        ParallelLogger::logInfo(paste0('splitSeed: ', splitSeed))
      } 
      indexes <-tryCatch(subjectSplitter(population, test=testFraction, train = trainFraction, nfold=nfold, seed=splitSeed),
                         finally=ParallelLogger::logTrace('Done.'))
    }
    if(testSplit=='time'){
      ParallelLogger::logTrace('Dataset time split started')
      indexes <-tryCatch(timeSplitter(population, test=testFraction, train = trainFraction, nfold=nfold),
                         finally=ParallelLogger::logTrace('Done.'))
    }
    if(testSplit=='person'){
      ParallelLogger::logTrace('Dataset person split started')
      if(is.null(splitSeed)){ #keep record of splitSeed
        splitSeed <- sample(20000000,1)-10000000
        ParallelLogger::logInfo(paste0('splitSeed: ', splitSeed))
      } 
      indexes <- tryCatch(personSplitter(population, test=testFraction, train = trainFraction, nfold=nfold, seed=splitSeed),
                          finally= ParallelLogger::logTrace('Done.')
      )
    }
  }
  
  if(nrow(population)!=nrow(indexes)){
    ParallelLogger::logError(sprintf('Population dimension not compatible with indexes: %d <-> %d', nrow(population), nrow(indexes)))
    stop('Population dimension not compatible with indexes')
  }
  
  # train the model
  tempmeta <- attr(population, "metaData")
  population <- merge(population, indexes, by = 'rowId')
  colnames(population)[colnames(population)=='index'] <- 'indexes'
  attr(population, "metaData") <- tempmeta
  
  settings <- list(data=plpData, minCovariateFraction=minCovariateFraction,
                   normalizeData = normalizeData,
                   modelSettings = modelSettings,
                   population=population,
                   cohortId=cohortId,
                   outcomeId=outcomeId)
  
  ParallelLogger::logInfo(sprintf('Training %s model',settings$modelSettings$name))  
  # the call is sinked because of the external calls (Python etc)
  if (sink.number()>0){
    ParallelLogger::logWarn(paste0('sink had ',sink.number(),' connections open!'))
  }
  #sink(logFileName, append = TRUE, split = TRUE)
  
  model <- tryCatch(do.call(fitPlp, settings),
                    error = function(e) {
                      stop(paste0(e))},
                    finally = {
                      ParallelLogger::logTrace('Done fitting.')})
  model$analysisId <- analysisId # adding this so we can link validation to models
  
  # get train prediction and remove it from model
  predictionTrain <- model$predictionTrain
  model$predictionTrain <- NULL
  
  # create test subset of population
  populationTest <- population[population$indexes<0,]
  attr(populationTest, 'metaData') <- attr(population, 'metaData')
  # calculate metrics
  ParallelLogger::logTrace('Prediction')
  predictionTest <- tryCatch(predictPlp(plpModel = model, 
                                        population = populationTest, #population, 
                                        plpData = plpData, 
                                        index = NULL), 
                             finally = ParallelLogger::logTrace('Done predict.'))
  
  prediction <- rbind(predictionTest, predictionTrain[,colnames(predictionTest)])
  
  ParallelLogger::logDebug(paste0('prediction null: ', is.null(prediction)))
  ParallelLogger::logDebug(paste0('prediction unique values: ', length(unique(prediction$value))))
  if(ifelse(is.null(prediction), FALSE, length(unique(prediction$value))>1)){
    
    # add analysisID
    attr(prediction, "metaData")$analysisId <- analysisId
    
    ParallelLogger::logInfo('Train set evaluation')
    performance.train <- evaluatePlp(prediction[prediction$indexes>0,], plpData)
    ParallelLogger::logTrace('Done.')
    ParallelLogger::logInfo('Test set evaluation')
    performance.test <- evaluatePlp(prediction[prediction$indexes<0,], plpData)
    ParallelLogger::logTrace('Done.')
    
    # now combine the test and train data and add analysisId
    performance <- reformatPerformance(train=performance.train, test=performance.test, analysisId)
    
    if(saveEvaluation){
      ParallelLogger::logTrace('Saving evaluation csv files')
      if(!dir.exists( file.path(analysisPath, 'evaluation') ))
        dir.create(file.path(analysisPath, 'evaluation'))
      tryCatch(utils::write.csv(performance$evaluationStatistics, file.path(analysisPath, 'evaluation', 'evaluationStatistics.csv'), row.names=F ),
               finally= ParallelLogger::logTrace('Saved EvaluationStatistics.')
      )
      tryCatch(utils::write.csv(performance$thresholdSummary, file.path(analysisPath, 'evaluation', 'thresholdSummary.csv'), row.names=F ),
               finally= ParallelLogger::logTrace('Saved ThresholdSummary.')
      )
      tryCatch(utils::write.csv(performance$demographicSummary, file.path(analysisPath, 'evaluation', 'demographicSummary.csv'), row.names=F),
               finally= ParallelLogger::logTrace('Saved DemographicSummary.')
      )
      tryCatch(utils::write.csv(performance$calibrationSummary, file.path(analysisPath, 'evaluation', 'calibrationSummary.csv'), row.names=F),
               finally= ParallelLogger::logTrace('Saved CalibrationSummary.')
      )
      tryCatch(utils::write.csv(performance$predictionDistribution, file.path(analysisPath, 'evaluation', 'predictionDistribution.csv'), row.names=F),
               finally= ParallelLogger::logTrace('Saved PredictionDistribution.')
      )
    }
    
  }else{
    ParallelLogger::logWarn(paste0('Evaluation not possible as prediciton NULL or all the same values'))
    performance.test <- NULL
    performance.train <- NULL
    performance <- NULL
  }
  
  # log the end time:
  endTime <- Sys.time()
  TotalExecutionElapsedTime <- difftime(endTime, ExecutionDateTime, units='mins')
  
  # 1) input settings:
  inputSetting <- list(dataExtrractionSettings=plpData$metaData$call,
                       populationSettings=attr(population, "metaData"),
                       modelSettings = modelSettings,
                       testSplit = testSplit, 
                       testFraction= testFraction,
                       nfold=nfold,
                       splitSeed = splitSeed)
  
  # 2) Executionsummary details:
  executionSummary <- list(PackageVersion = list(rVersion= R.Version()$version.string,
                                                 packageVersion = utils::packageVersion("PatientLevelPrediction")),
                           PlatformDetails= list(platform= R.Version()$platform,
                                                 cores= Sys.getenv('NUMBER_OF_PROCESSORS'),
                                                 RAM=benchmarkme::get_ram()),
                           # Sys.info()
                           TotalExecutionElapsedTime = TotalExecutionElapsedTime,
                           ExecutionDateTime = ExecutionDateTime,
                           Log = logFileName # location for now
                           #Not available at the moment: CDM_SOURCE -  meta-data containing CDM version, release date, vocabulary version
  )
  
  if(runCovariateSummary){
    ParallelLogger::logInfo(paste0('Calculating covariate summary @ ', Sys.time()))
    ParallelLogger::logInfo('This can take a while...')
    covSummary <- covariateSummary(plpData, population, model)
    
    if(saveEvaluation){
      ParallelLogger::logTrace('Saving covariate summary as csv')
      if(!dir.exists( file.path(analysisPath, 'evaluation') ))
        dir.create(file.path(analysisPath, 'evaluation'))
      tryCatch(utils::write.csv(covSummary, file.path(analysisPath, 'evaluation', 'covariateSummary.csv'), row.names=F ),
               finally= ParallelLogger::logTrace('Saved covariate summary.')
      )
    }
    ParallelLogger::logInfo(paste0('Finished covariate summary @ ', Sys.time()))
    
  } else{
    ParallelLogger::logInfo('Skipping covariate summary')
    covSummary <- NULL
  }
  

  results <- list(inputSetting=inputSetting,
                  executionSummary=executionSummary,
                  model=model,
                  prediction=prediction,
                  performanceEvaluation=performance,
                  covariateSummary=covSummary,
                  analysisRef=list(analysisId=analysisId,
                                   analysisName=NULL,#analysisName,
                                   analysisSettings= NULL))
  class(results) <- c('runPlp')
  
  # save the plots?
  if(savePlpPlots & !is.null(performance)){
    plotPlp(result = results, filename = file.path(analysisPath))
  }
  
  # save the results
  if(savePlpResult){
    ParallelLogger::logInfo(paste0('Saving PlpResult'))
    tryCatch(savePlpResult(results, file.path(analysisPath,'plpResult')),
             finally= ParallelLogger::logTrace('Done.'))
    ParallelLogger::logInfo(paste0('plpResult saved to ..\\', analysisPath ,'\\plpResult'))
    
    # update from temp location to saved location
    results$model <- updateModelLocation(results$model, file.path(analysisPath,'plpResult'))
  }
  
  
  ParallelLogger::logInfo(paste0('Log saved to ',logFileName))  
  ParallelLogger::logInfo("Run finished successfully.")
  
  # stop logger
  ParallelLogger::clearLoggers()
  logger <- ParallelLogger::createLogger(name = "SIMPLE",
                                         threshold = "INFO",
                                         appenders = list(ParallelLogger::createConsoleAppender(layout = ParallelLogger::layoutTimestamp)))
  ParallelLogger::registerLogger(logger)
  
  return(results)
  
}


#' @export
summary.runPlp <- function(object, ...) {
  
  if(object$model$modelSettings$model=="lr_lasso"){
    hyper <-  paste0("The final model hyper-parameters were - variance: ",format(as.double(object$model$hyperParamSearch['priorVariance']), digits = 5))
  } else if(is.null(object$model$hyperParamSearch)){
    hyper <- 'No hyper-parameters...'
  } else {
    finalmod <- object$model$hyperParamSearch[which.max(object$model$hyperParamSearch$cv_auc),]
    finalmod <- finalmod[,!colnames(finalmod)%in%c('seed','cv_auc')]
    hyper <- paste0("The final model hyper-parameters were -", 
                    paste(colnames(finalmod), finalmod, collapse='; ', sep=': ')
    )
  }
  
  writeLines(paste0("The study was started at: ", object$executionSummary$ExecutionDateTime, 
                    " and took at total of ", as.double(object$executionSummary$TotalExecutionElapsedTime, unit='mins'),
                    " minutes.  ", hyper))
  
  aucInd <- object$performanceEvaluation$evaluationStatistics[,'Eval']=='test' & 
    object$performanceEvaluation$evaluationStatistics[,'Metric']%in%c('auc','AUC.auc')
  
  brierScoreInd <- object$performanceEvaluation$evaluationStatistics[,'Eval']=='test' & 
    object$performanceEvaluation$evaluationStatistics[,'Metric']%in%c('BrierScore')
  
  brierScaledInd <- object$performanceEvaluation$evaluationStatistics[,'Eval']=='test' & 
    object$performanceEvaluation$evaluationStatistics[,'Metric']%in%c('BrierScaled')
  
  calibrationSlopeInd <- object$performanceEvaluation$evaluationStatistics[,'Eval']=='test' & 
    object$performanceEvaluation$evaluationStatistics[,'Metric']%in%c('CalibrationSlope.Gradient')
  
  calibrationInterceptInd <- object$performanceEvaluation$evaluationStatistics[,'Eval']=='test' & 
    object$performanceEvaluation$evaluationStatistics[,'Metric']%in%c('CalibrationIntercept.Intercept')
  
  result <- list(cohortId=attr(object$prediction, "metaData")$cohortId,
                 outcomeId=attr(object$prediction, "metaData")$outcomeId,
                 model= object$model$modelSettings$model,
                 parameters = object$model$modelSettings$param,
                 hyperParamsearch = object$model$hyperParamSearch,
                 elaspsedTime = object$executionSummary$TotalExecutionElapsedTime,
                 AUC = object$performanceEvaluation$evaluationStatistics[aucInd,'Value'],
                 BrierScore = object$performanceEvaluation$evaluationStatistics[brierScoreInd,'Value'],
                 BrierScaled = object$performanceEvaluation$evaluationStatistics[brierScaledInd,'Value'],
                 CalibrationIntercept = object$performanceEvaluation$evaluationStatistics[calibrationInterceptInd,'Value'],
                 CalibrationSlope = object$performanceEvaluation$evaluationStatistics[calibrationSlopeInd,'Value']
                 
  )
  class(result) <- "summary.runPlp"
  return(result)
}



# this function calcualtes:
# CovariateCount	CovariateCountWithOutcome	
# CovariateCountWithNoOutcome	CovariateMeanWithOutcome	
# CovariateMeanWithNoOutcome	CovariateStDevWithOutcome	
# CovariateStDevWithNoOutcome	CovariateStandardizedMeanDifference
covariateSummary <- function(plpData, population = NULL, model = NULL){
  #===========================
  # all 
  #===========================
  if(!is.null(model$varImp)){
    variableImportance <- tibble::as_tibble(model$varImp[,colnames(model$varImp)!='covariateName'])
  } else{
    variableImportance <- tibble::tibble(covariateId = 1,
                                         covariateValue = 0)
  }
  
  # create population with index rowId
  if(!is.null(population)){
    if('indexes' %in% colnames(population)){
      plpData$covariateData$population <- population %>%
        dplyr::mutate(test = indexes<0) %>%
        dplyr::select(rowId, test, outcomeCount )
    } else{
      plpData$covariateData$population <- population %>%
        dplyr::select(rowId, outcomeCount)
    }
    
  } else{
    plpData$covariateData$population <- plpData$cohorts  %>%
      dplyr::select(rowId)
  }
  RSQLite::dbExecute(plpData$covariateData, "CREATE INDEX pop_rowId ON population(rowId)")
  on.exit(plpData$covariateData$population <- NULL, add = TRUE)
  
  
  
  # now join cov and pop
  covariates <- plpData$covariateData$covariates %>% 
    dplyr::inner_join(plpData$covariateData$population, by= 'rowId')
  
  # now group by covariateId and groupVal to get sum and square sums
  if(!is.null(population)){
    
    if('indexes' %in% colnames(population)){
      plpData$covariateData$totals <- plpData$covariateData$population %>% 
        dplyr::group_by(test, outcomeCount) %>%
        dplyr::summarise(N = dplyr::n())
      
      result <- covariates %>%
        dplyr::group_by(covariateId,test, outcomeCount) %>%
        dplyr::summarise(CovariateCount = dplyr::n(),
                         sumVal = sum(covariateValue,na.rm = TRUE),
                         sumSquares = sum(covariateValue^2,na.rm = TRUE)) %>%
        dplyr::inner_join(plpData$covariateData$totals, by= c('test', 'outcomeCount')) %>%
        dplyr::mutate(CovariateMean = sumVal/N,
                      CovariateStDev = sqrt(sumSquares/N - (sumVal/N)^2 )) %>% 
        dplyr::collect()
      
      
    } else {
      
      plpData$covariateData$totals <- plpData$covariateData$population %>% 
        dplyr::group_by(outcomeCount) %>%
        dplyr::summarise(N = dplyr::n())
      
      result <- covariates %>%
        dplyr::group_by(covariateId,outcomeCount) %>%
        dplyr::summarise(CovariateCount = dplyr::n(),
                         sumVal = sum(covariateValue,na.rm = TRUE),
                         sumSquares = sum(covariateValue^2,na.rm = TRUE)) %>%
        dplyr::inner_join(plpData$covariateData$totals, by= c( 'outcomeCount')) %>%
        dplyr::mutate(CovariateMean = sumVal/N,
                      CovariateStDev = sqrt(sumSquares/N - (sumVal/N)^2 )) %>% 
        dplyr::collect()
      
    }
  }else{
    # get all results:
    N <- nrow(plpData$cohorts)
    resultAll <-  covariates %>%
      dplyr::group_by(covariateId) %>%
      dplyr::summarise(CovariateCount = dplyr::n(),
                       sumVal = sum(covariateValue,na.rm = TRUE),
                       sumSquares = sum(covariateValue^2,na.rm = TRUE)) %>%
      dplyr::mutate(CovariateMean = 1.0*sumVal/!!N,
                    CovariateStDev = sqrt(sumSquares*1.0/!!N - (sumVal*1.0/!!N)^2 )) %>%
      dplyr::select(covariateId, CovariateCount, CovariateMean, CovariateStDev) %>%
      dplyr::collect()
    
  }
  
  
  if(!is.null(population)){
    
    resultAll <-  result %>%
      dplyr::group_by(covariateId) %>%
      dplyr::summarise(CovariateCount = sum(CovariateCount,na.rm = TRUE),
                       sumValall = sum(sumVal,na.rm = TRUE),
                       sumSquaresall = sum(sumSquares,na.rm = TRUE),
                       Nall = sum(N,na.rm = TRUE)) %>%
      dplyr::mutate(CovariateMean = sumValall/Nall,
                    CovariateStDev = sqrt(sumSquaresall/Nall - (sumValall/Nall)^2)) %>% 
      dplyr::select(covariateId, CovariateCount, CovariateMean, CovariateStDev) 
    
    
    if('indexes' %in% colnames(population)){
      
      resultOut <- result %>%
        dplyr::group_by(covariateId, outcomeCount) %>%
        dplyr::summarise(CovariateCount = sum(CovariateCount,na.rm = TRUE),
                         sumValall = sum(sumVal,na.rm = TRUE),
                         sumSquaresall = sum(sumSquares,na.rm = TRUE),
                         Nall = sum(N,na.rm = TRUE)) %>%
        dplyr::mutate(CovariateMean = sumValall/Nall,
                      CovariateStDev = sqrt(sumSquaresall/Nall - (sumValall/Nall)^2)) 
      
      resultOut1 <- resultOut %>% 
        dplyr::filter(outcomeCount == 0) %>%
        dplyr::mutate(CovariateCountWithNoOutcome = CovariateCount,
                      CovariateMeanWithNoOutcome = CovariateMean,
                      CovariateStDevWithNoOutcome = CovariateStDev) %>%
        dplyr::select(covariateId, 
                      CovariateCountWithNoOutcome, 
                      CovariateMeanWithNoOutcome,
                      CovariateStDevWithNoOutcome)
      
      resultOut2 <- resultOut %>% 
        dplyr::filter(outcomeCount == 1) %>%
        dplyr::mutate(CovariateCountWithOutcome = CovariateCount,
                      CovariateMeanWithOutcome = CovariateMean,
                      CovariateStDevWithOutcome = CovariateStDev) %>%
        dplyr::select(covariateId, 
                      CovariateCountWithOutcome, 
                      CovariateMeanWithOutcome,
                      CovariateStDevWithOutcome)
      
      resultAll <- resultAll %>%
        dplyr::left_join(resultOut1, by = 'covariateId') %>%
        dplyr::left_join(resultOut2, by = 'covariateId')
      
      resultAll <- resultAll %>% dplyr::mutate(StandardizedMeanDiff = (CovariateMeanWithOutcome - CovariateMeanWithNoOutcome)/sqrt(CovariateStDevWithOutcome^2 + CovariateStDevWithNoOutcome^2) )
      
      resultSplit1 <- result %>% 
        dplyr::filter(outcomeCount == 0 & test == TRUE) %>%
        dplyr::mutate(TestCovariateCountWithNoOutcome = CovariateCount,
                      TestCovariateMeanWithNoOutcome = CovariateMean,
                      TestCovariateStDevWithNoOutcome = CovariateStDev) %>%
        dplyr::ungroup() %>%
        dplyr::select(covariateId, 
                      TestCovariateCountWithNoOutcome, 
                      TestCovariateMeanWithNoOutcome,
                      TestCovariateStDevWithNoOutcome)
      
      resultSplit2 <- result %>% 
        dplyr::filter(outcomeCount == 0 & test == FALSE) %>%
        dplyr::mutate(TrainCovariateCountWithNoOutcome = CovariateCount,
                      TrainCovariateMeanWithNoOutcome = CovariateMean,
                      TrainCovariateStDevWithNoOutcome = CovariateStDev) %>%
        dplyr::ungroup() %>%
        dplyr::select(covariateId, 
                      TrainCovariateCountWithNoOutcome, 
                      TrainCovariateMeanWithNoOutcome,
                      TrainCovariateStDevWithNoOutcome)
      
      resultSplit3 <- result %>% 
        dplyr::filter(outcomeCount == 1 & test == TRUE) %>%
        dplyr::mutate(TestCovariateCountWithOutcome = CovariateCount,
                      TestCovariateMeanWithOutcome = CovariateMean,
                      TestCovariateStDevWithOutcome = CovariateStDev) %>%
        dplyr::ungroup() %>%
        dplyr::select(covariateId, 
                      TestCovariateCountWithOutcome, 
                      TestCovariateMeanWithOutcome,
                      TestCovariateStDevWithOutcome)
      
      resultSplit4 <- result %>% 
        dplyr::filter(outcomeCount == 1 & test == FALSE) %>%
        dplyr::mutate(TrainCovariateCountWithOutcome = CovariateCount,
                      TrainCovariateMeanWithOutcome = CovariateMean,
                      TrainCovariateStDevWithOutcome = CovariateStDev) %>%
        dplyr::ungroup() %>%
        dplyr::select(covariateId, 
                      TrainCovariateCountWithOutcome, 
                      TrainCovariateMeanWithOutcome,
                      TrainCovariateStDevWithOutcome)
      
      resultAll <- resultAll %>%
        dplyr::left_join(resultSplit1, by = 'covariateId') %>%
        dplyr::left_join(resultSplit2, by = 'covariateId') %>%
        dplyr::left_join(resultSplit3, by = 'covariateId') %>%
        dplyr::left_join(resultSplit4, by = 'covariateId')
      
    } else{
      
      resultOut1 <- result %>% 
        dplyr::filter(outcomeCount == 0) %>%
        dplyr::mutate(CovariateCountWithNoOutcome = CovariateCount,
                      CovariateMeanWithNoOutcome = CovariateMean,
                      CovariateStDevWithNoOutcome = CovariateStDev) %>%
        dplyr::select(covariateId, 
                      CovariateCountWithNoOutcome, 
                      CovariateMeanWithNoOutcome,
                      CovariateStDevWithNoOutcome)
      
      resultOut2 <- result %>% 
        dplyr::filter(outcomeCount == 1) %>%
        dplyr::mutate(CovariateCountWithOutcome = CovariateCount,
                      CovariateMeanWithOutcome = CovariateMean,
                      CovariateStDevWithOutcome = CovariateStDev) %>%
        dplyr::select(covariateId, 
                      CovariateCountWithOutcome, 
                      CovariateMeanWithOutcome,
                      CovariateStDevWithOutcome)
      
      resultAll <- resultAll %>%
        dplyr::left_join(resultOut1, by = 'covariateId') %>%
        dplyr::left_join(resultOut2, by = 'covariateId')
      
      resultAll <- resultAll %>% dplyr::mutate(StandardizedMeanDiff = (CovariateMeanWithOutcome - CovariateMeanWithNoOutcome)/sqrt(CovariateStDevWithOutcome^2 + CovariateStDevWithNoOutcome^2) )
      
    }
  }
  
  
  resultAll <- plpData$covariateData$covariateRef %>% dplyr::collect() %>%
    dplyr::left_join(variableImportance, 
                     by ='covariateId') %>%
    dplyr::left_join(resultAll, by ='covariateId')
  
  resultAll$covariateValue[is.na(resultAll$covariateValue)] <- 0
  
  resultAll[is.na(resultAll)] <- 0
  
  return(resultAll)  
}

characterize <- function(plpData, population, N=1){

  if(!missing(population)){
  plpData$covariateData$population <- tibble::as_tibble(population) %>% 
    dplyr::select(rowId, outcomeCount)
  on.exit(plpData$covariateData$population <- NULL, add = TRUE)
  # restrict to pop
  covariates <- plpData$covariateData$covariates %>% 
    dplyr::inner_join(plpData$covariateData$population)
  } else{
    covariates <- plpData$covariateData$covariates
  }
  
  
  popSize <- as.data.frame(covariates %>% dplyr::select(rowId) %>% 
    dplyr::summarise(counts = dplyr::n_distinct(rowId)))$counts

  allPeople <- covariates %>% 
    dplyr::group_by(covariateId) %>%
    dplyr::summarise(CovariateCount = dplyr::n(),
                     CovariateFraction = 1.0*dplyr::n()/popSize) %>%
    dplyr::filter(CovariateCount >= N) %>%
    dplyr::select(covariateId, CovariateCount,CovariateFraction)
  allPeople <- as.data.frame(allPeople)
  
  return(allPeople)
}
hxia/plp-git-demo documentation built on March 19, 2021, 1:54 a.m.