R/RunMultiplePlp.R

Defines functions getNames getDataSettings getSettingFromId getSettingsTable getIdsForSetting getSettingValues getidList validateMultiplePlp loadPlpAnalysesJson savePlpAnalysesJson createModelDesign runMultiplePlp

Documented in createModelDesign loadPlpAnalysesJson runMultiplePlp savePlpAnalysesJson validateMultiplePlp

# @file RunMultiplePlp.R
#
# Copyright 2021 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


#' Run a list of predictions analyses
#'
#' @details
#' This function will run all specified predictions as defined using . 
#'
#' @param databaseDetails               The database settings created using \code{createDatabaseDetails()}
#' @param modelDesignList                A list of model designs created using \code{createModelDesign()}
#' @param onlyFetchData                  Only fetches and saves the data object to the output folder without running the analysis.
#' @param splitSettings                  The train/validation/test splitting used by all analyses created using \code{createDefaultSplitSetting()}
#' @param cohortDefinitions               A list of cohort definitions for the target and outcome cohorts
#' @param logSettings                    The setting spexcifying the logging for the analyses created using \code{createLogSettings()}
#' @param saveDirectory                   Name of the folder where all the outputs will written to.
#' 
#' @return
#' A data frame with the following columns: \tabular{ll}{ \verb{analysisId} \tab The unique identifier
#' for a set of analysis choices.\cr \verb{cohortId} \tab The ID of the target cohort populations.\cr
#' \verb{outcomeId} \tab The ID of the outcomeId.\cr \verb{dataLocation} \tab The location where the plpData was saved \cr \verb{evaluationFolder} \tab The name of file containing the evaluation saved as a csv
#'  \cr \verb{the settings ids} \tab The ids for all other settings used for model development.\cr }
#'
#' @export
runMultiplePlp <- function(
  databaseDetails = createDatabaseDetails(),
  modelDesignList = list(
    createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()), 
    createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
  ),
  onlyFetchData = F,
  splitSettings = createDefaultSplitSetting(
    type = "stratified", 
    testFraction = 0.25,
    trainFraction = 0.75, 
    splitSeed = 123, 
    nfold = 3
  ),
  cohortDefinitions = NULL,
  logSettings = createLogSettings(
    verbosity = "DEBUG", 
    timeStamp = T, 
    logName = "runPlp Log"
  ),
  saveDirectory = getwd()
){
  
  #input checks
  checkIsClass(databaseDetails, c('databaseDetails'))
  checkIsClass(modelDesignList, c('list', 'modelDesign'))
  checkIsClass(onlyFetchData, 'logical')
  checkIsClass(splitSettings, 'splitSettings')
  checkIsClass(logSettings, 'logSettings')
  checkIsClass(saveDirectory, 'character')
  if(!dir.exists(saveDirectory)){
    dir.create(saveDirectory, recursive = T)
  }
  
  # get idList
  idList <- getidList(modelDesignList = modelDesignList)
  
  # get settings data.frame
  settingstable <- getSettingsTable(
    modelDesignList = modelDesignList, 
    idList = idList
  )
  
  if(!is.null(cohortDefinitions)){
    cohortNames <- data.frame(
      targetName = getNames(cohortDefinitions, settingstable$targetId),
      outcomeName = getNames(cohortDefinitions, settingstable$outcomeId)
      )
    settingstable <- cbind(cohortNames, settingstable)
  }
  
  utils::write.csv(settingstable, file.path(saveDirectory,'settings.csv'), row.names = F)
  saveJsonFile(idList, file.path(saveDirectory,'settings.json'))
  
  # list(targetId, covariateSetting, outcomeIds, saveLocation)
  dataSettings <- getDataSettings(settingstable)
  
  # extract data
  for(i in 1:length(dataSettings)){
    dataExists <- length(dir(file.path(saveDirectory, dataSettings[[i]]$dataLocation)))>0
    if(!dataExists){
      ParallelLogger::logInfo(paste('Extracting data for cohort', dataSettings[[i]]$targetId, 'to', file.path(saveDirectory, dataSettings[[i]]$dataLocation)))
      
      databaseDetails$cohortId <- dataSettings[[i]]$targetId
      databaseDetails$outcomeIds <- dataSettings[[i]]$outcomeIds
      
      plpDataSettings <- list(
        databaseDetails = databaseDetails,
        covariateSettings = getSettingFromId(idList, type = 'covariateSettings', dataSettings[[i]]$covariateSettings),
        restrictPlpDataSettings = getSettingFromId(idList, type = 'restrictPlpDataSettings', dataSettings[[i]]$restrictPlpDataSettings)
        )
      
      
      plpData <- tryCatch(
        {do.call(getPlpData, plpDataSettings)},
        error = function(e){ParallelLogger::logInfo(e); return(NULL)}
      )
      if(!is.null(plpData)){
        savePlpData(plpData, file.path(saveDirectory, dataSettings[[i]]$dataLocation))
      }
    } else{
      ParallelLogger::logInfo(paste('Data for cohort', dataSettings[[i]]$targetId, 'exists at', file.path(saveDirectory, dataSettings[[i]]$dataLocation)))
    }
  }
  
  # runPlp
  if(!onlyFetchData){
    for(i in 1:nrow(settingstable)){
      modelDesign <- modelDesignList[[i]]
      settings <- settingstable[i,]
      
      dataExists <- length(dir(file.path(saveDirectory, settings$dataLocation)))>0
      
      if(dataExists){
        plpData <- PatientLevelPrediction::loadPlpData(file.path(saveDirectory, settings$dataLocation))
        
        analysisExists <- file.exists(file.path(saveDirectory, settings$analysisId,'plpResult', 'runPlp.rds'))
        if(!analysisExists){
          
          runPlpSettings <- list(
            plpData = plpData,
            outcomeId = modelDesign$outcomeId,
            analysisId = settings$analysisId,
            populationSettings = modelDesign$populationSettings,
            splitSettings = splitSettings,
            sampleSettings = modelDesign$sampleSettings,
            featureEngineeringSettings = modelDesign$featureEngineeringSettings,
            preprocessSettings = modelDesign$preprocessSettings,
            modelSettings = modelDesign$modelSettings,
            logSettings = logSettings,
            executeSettings = modelDesign$executeSettings,
            saveDirectory = saveDirectory
          )
          
          result <- tryCatch(
            {do.call(runPlp, runPlpSettings)},
            error = function(e){ParallelLogger::logInfo(e); return(NULL)}
          )
        } else{
          ParallelLogger::logInfo(paste('Analysis ', settings$analysisId, 'exists at', file.path(saveDirectory, settings$analysisId)))
        }
      }
    }
    
    
  }
  return(invisible(settingstable))
}


#' Specify settings for deceloping a single model 
#'
#' @details
#' This specifies a single analysis for developing as single model
#'
#' @param targetId              The id of the target cohort that will be used for data extraction (e.g., the ATLAS id)
#' @param outcomeId              The id of the outcome that will be used for data extraction (e.g., the ATLAS id)
#' @param restrictPlpDataSettings       The settings specifying the extra restriction settings when extracting the data created using \code{createRestrictPlpDataSettings()}.
#' @param populationSettings             The population settings specified by \code{createStudyPopulationSettings()}
#' @param covariateSettings              The covariate settings, this can be a list or a single \code{'covariateSetting'} object.
#' @param featureEngineeringSettings      Either NULL or an object of class \code{featureEngineeringSettings} specifying any feature engineering used during model development
#' @param sampleSettings                  Either NULL or an object of class \code{sampleSettings} with the over/under sampling settings used for model development
#' @param preprocessSettings              Either NULL or an object of class \code{preprocessSettings} created using \code{createPreprocessingSettings()}
#' @param modelSettings                   The model settings such as \code{setLassoLogisticRegression()}
#' @param runCovariateSummary             Whether to run the covariateSummary
#' 
#' @return
#' A list with analysis settings used to develop a single prediction model
#'
#' @export
createModelDesign <- function(
  targetId,
  outcomeId,
  restrictPlpDataSettings = createRestrictPlpDataSettings(),
  populationSettings = createStudyPopulationSettings(),
  covariateSettings = FeatureExtraction::createDefaultCovariateSettings(),
  featureEngineeringSettings = NULL,
  sampleSettings = NULL,
  preprocessSettings = NULL,
  modelSettings = NULL,
  runCovariateSummary = T
){
  
  checkIsClass(targetId, c('numeric','integer'))
  checkIsClass(outcomeId, c('numeric','integer'))
  
  checkIsClass(populationSettings, c('populationSettings'))
  checkIsClass(restrictPlpDataSettings, 'restrictPlpDataSettings')
  checkIsClass(covariateSettings, c('covariateSettings', 'list'))
  
  useFE <- F
  if(!is.null(featureEngineeringSettings)){
    checkIsClass(featureEngineeringSettings, c('featureEngineeringSettings'))
    useFE <- T
  } else{
    featureEngineeringSettings <- createFeatureEngineeringSettings(type = "none")
  }
  
  useSample <- F
  if(!is.null(sampleSettings)){
    checkIsClass(sampleSettings, c('sampleSettings'))
    useSample <- T
  } else{
    sampleSettings <- createSampleSettings(type = "none")
  }
  
  usePreprocess <- F
  if(!is.null(preprocessSettings)){
    checkIsClass(preprocessSettings, c('preprocessSettings'))
    usePreprocess <- T
  } else{
    preprocessSettings <- createPreprocessSettings(
      minFraction = 0.001,
      normalize = T
    )
  }
  
  checkIsClass(modelSettings, c('modelSettings'))
  
  settings <- list(
    targetId = targetId,
    outcomeId = outcomeId,
    restrictPlpDataSettings = restrictPlpDataSettings,
    covariateSettings = covariateSettings,
    populationSettings = populationSettings,
    sampleSettings = sampleSettings,
    featureEngineeringSettings = featureEngineeringSettings,
    preprocessSettings = preprocessSettings,
    modelSettings = modelSettings,
    executeSettings = createExecuteSettings(
      runSplitData = T,
      runSampleData = useSample,
      runfeatureEngineering = useFE,
      runPreprocessData = usePreprocess,
      runModelDevelopment = !is.null(modelSettings),
      runCovariateSummary =  runCovariateSummary
    )
    
  )
  
  class(settings) <- 'modelDesign'
  return(settings)
  
}


#' Save the modelDesignList to a json file
#'
#' @details
#' This function creates a json file with the modelDesignList saved
#' 
#' @param modelDesignList          A list of modelDesigns created using \code{createModelDesign()}
#' @param saveDirectory            The directory to save the modelDesignList settings
#'                                       
#' @examples
#' \dontrun{
#' savePlpAnalysesJson(
#' modelDesignList = list(
#' createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()), 
#' createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
#' ),
#' saveDirectory = 'C:/bestModels'
#' )
#' }
#'
#' @export
savePlpAnalysesJson <- function(
  modelDesignList = list(
  createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()), 
  createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
  ),
  saveDirectory = NULL
  ){
  
  if(class(modelDesignList) == 'modelDesign'){
    modelDesignList <- list(modelDesignList)
  }
  
  lapply(modelDesignList, function(x){checkIsClass(x, 'modelDesign')})

  
  # save this as a json
  modelDesignList <- lapply(modelDesignList, function(x) prepareToJson(x))
  jsonSettings <- list(analyses = modelDesignList) # TODO: rename this ModelDesignList?
  
  if(!is.null(saveDirectory)){
    checkIsClass(saveDirectory, 'character')
    
    if(!dir.exists(saveDirectory)){
      dir.create(saveDirectory, recursive = T)
    }
    
    modelDesignList <- jsonlite::toJSON(
      x = jsonSettings, 
      pretty = T, 
      digits = 23, 
      auto_unbox=TRUE, 
      null = "null"
      )
    write(modelDesignList, file.path(saveDirectory,"predictionAnalysisList.json"))
    
    # should we add splitSettings to this and the input?
    
    return(file.path(saveDirectory,"predictionAnalysisList.json")) 
  }
  
  return(jsonSettings)
}


#' Load the multiple prediction json settings from a file
#'
#' @details
#' This function interprets a json with the multiple prediction settings and creates a list 
#' that can be combined with connection settings to run a multiple prediction study
#' 
#' @param jsonFileLocation    The location of the file 'predictionAnalysisList.json' with the modelDesignList  
#'                                      
#' @examples
#' \dontrun{
#' modelDesignList <- loadPlpAnalysesJson('location of json settings')$analysis
#' }
#'
#' @export
loadPlpAnalysesJson <- function(
  jsonFileLocation 
){
  
  checkIsClass(jsonFileLocation, 'character')
  if(!file.exists(jsonFileLocation)){
    ParallelLogger::logError('Invalid directory - does not exist')
  }
  
  if(!file.exists(file.path(jsonFileLocation))){
    ParallelLogger::logError('predictionAnalysisList.json not found ')
  }
  
  
  json <- tryCatch(
    {readChar(jsonFileLocation, file.info(jsonFileLocation)$size)},
    error= function(cond) {
      ParallelLogger::logInfo('Issue with loading json file...');
      ParallelLogger::logError(cond)
    })
  json <- tryCatch(
    {jsonlite::fromJSON(json, simplifyVector = T, simplifyDataFrame = F, simplifyMatrix = T)},
    error = function(cond) {
      ParallelLogger::logInfo('Issue with parsing json object...');
      ParallelLogger::logError(cond)
    })
  json$analyses <- tryCatch(
    {lapply(json$analyses, function(x) prepareToRlist(x))},
    error = function(cond) {
      ParallelLogger::logInfo('Issue converting json to R list...');
      ParallelLogger::logError(cond)
    })
  
  # if splitSettings in json
  if('splitSettings' %in% names(json)){
    # update the splitsetting (move this into load/saveplpAnalysis)
    if('attributes' %in% names(json$splitSettings)){
      atts <- json$splitSettings$attributes
      json$splitSettings$attributes <- NULL
      attributes(json$splitSettings) <- atts
    }
  }
  
  return(json)
  
}





#' externally validate the multiple plp models across new datasets
#' @description
#' This function loads all the models in a multiple plp analysis folder and
#' validates the models on new data
#' @details
#' Users need to input a location where the results of the multiple plp analyses
#' are found and the connection and database settings for the new data
#' 
#' @param analysesLocation                The location where the multiple plp analyses are
#' @param validationDatabaseDetails       The validation database settings created using \code{createDatabaseDetails()}
#' @param validationRestrictPlpDataSettings  The settings specifying the extra restriction settings when extracting the data created using \code{createRestrictPlpDataSettings()}.
#' @param recalibrate                      A vector of recalibration methods (currently supports 'RecalibrationintheLarge' and/or 'weakRecalibration')
#' @param saveDirectory               The location to save to validation results
#' 
#' @export 
validateMultiplePlp <- function(
  analysesLocation,
  validationDatabaseDetails,
  validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
  recalibrate = NULL,
  saveDirectory = NULL
  ){

  # add input checks 
  checkIsClass(analysesLocation, 'character')
  
  checkIsClass(validationDatabaseDetails, 'databaseDetails')
  checkIsClass(validationRestrictPlpDataSettings, 'restrictPlpDataSettings')
  
  checkIsClass(recalibrate, c('character', 'NULL'))
  checkIsClass(saveDirectory, c('character', 'NULL'))

  # for each model run externalValidateDbPlp()
  analyses <- dir(analysesLocation, recursive = F, full.names = F)
  
  # now fine all analysis folders..
  analyses <- analyses[grep('Analysis_',analyses)]
  
  for(i in 1:length(analyses)){
    
    if(is.null(saveDirectory)){
      saveLocation <- file.path(analysesLocation, 'Validation')
    } else{
      saveLocation <- saveDirectory
    }
    
    analysis <- analyses[i]
    modelSettings <- file.path(analysesLocation, analysis)
    
    ParallelLogger::logInfo(paste0('Evaluating model in ',modelSettings ))
    
    if(dir.exists(file.path(modelSettings[i],'plpResult'))){
      ParallelLogger::logInfo(paste0('plpResult found in ',modelSettings ))
      
      plpModel <- loadPlpModel(file.path(modelSettings,'plpResult','model'))
      
      validations <-   tryCatch(
        {
          externalValidateDbPlp(
            plpModel = plpModel,
            validationDatabaseDetails = validationDatabaseDetails,
            validationRestrictPlpDataSettings = validationRestrictPlpDataSettings,
            settings = createValidationSettings(
              recalibrate = recalibrate
            ),
            outputFolder = saveLocation
          )},
        error = function(cont){ParallelLogger::logInfo(paste0('Error: ',cont ))
          ;return(NULL)}
      )
      
    }
  }
  
}















# HELPERS
#===============================
getidList <- function(modelDesignList){
  
  types <- c(
    'targetId', 
    'outcomeId', 
    'restrictPlpDataSettings',
    'covariateSettings', 
    'populationSettings', 
    'sampleSettings',
    'featureEngineeringSettings',
    'preprocessSettings',
    'modelSettings',
    'executeSettings'
  )
  
  idList <- list()
  length(idList) <- length(types)
  names(idList) <- types
  
  for(type in types){
    idList[[type]] <- getSettingValues(modelDesignList, type = type )
  }
  
  return(idList)
}


getSettingValues <- function(modelDesignList, type = 'cohortId' ){
  
  if(class(modelDesignList) == 'list'){
    values <- unique(unlist(lapply(modelDesignList, function(x)jsonlite::serializeJSON(x[[type]])))
      )
  } else{
    values <- jsonlite::serializeJSON(modelDesignList[[type]])
  }
  
  if(! type %in% c('targetId', 'outcomeId')  ){
    result <- data.frame(
      value = values,
      id = 1:length(values)
    )
  } else{
    result <- data.frame(
      value = sapply(values, function(x) jsonlite::unserializeJSON(x)),
      id = sapply(values, function(x) jsonlite::unserializeJSON(x))
    )
  }
  
  return(result)
}

# get the ids for the model design settings
getIdsForSetting <- function(modelDesign, idList){
  
  ids <- c()
  
  for(settingType in names(idList)){
    
    if(!settingType %in% c('targetId', 'outcomeId')){
    # get the index of the setting matching the design setting
      ind <- which(idList[[settingType]]$value == jsonlite::serializeJSON(modelDesign[[settingType]]))
    } else{
      ind <- which(idList[[settingType]]$value == modelDesign[[settingType]])
    }
    # get the id
    id <- idList[[settingType]]$id[ind]
    
    ids <- c(ids, id)
  }
  
  names(ids) <- names(idList)
  
  return(ids)
}


# this creates a data.frame with the analysisId and settingsId for each analysis
# need to add the data location to this
getSettingsTable <- function(modelDesignList, idList){
  
  result <- lapply(modelDesignList, function(x) getIdsForSetting(x, idList) )
  settingsTable <- do.call(rbind, result)
  settingsTable <- as.data.frame(settingsTable)
  
  settingsTable$analysisId <- paste0('Analysis_', 1:nrow(settingsTable))
  
  settingsTable$dataLocation <- paste0('T_',settingsTable$targetId, '_L_', settingsTable$covariateSettings*settingsTable$restrictPlpDataSettings)
  
  return(settingsTable)
}


getSettingFromId <- function(
  idList, 
  type, 
  id
){
  ind <- which(idList[[type]]$id == id)
  if(!type %in% c('targetId', 'outcomeId')){
    return(jsonlite::unserializeJSON(as.character(idList[[type]]$value[[ind]])))
  } else{
    return(idList[[type]]$value[[ind]])
  }
}


getDataSettings <- function(settingstable){
  
  combos <- settingstable %>% 
    dplyr::distinct(.data$targetId,.data$covariateSettings,.data$restrictPlpDataSettings,.data$dataLocation)
  
  result <- list()
  length(result) <- nrow(combos)
  for(i in 1:nrow(combos)){
    result[[i]] <- list(
      targetId = combos$targetId[i],
      covariateSettings = combos$covariateSettings[i],
      restrictPlpDataSettings = combos$restrictPlpDataSettings[i],
      dataLocation = combos$dataLocation[i],
      outcomeIds = settingstable %>%  
        dplyr::filter(.data$dataLocation == combos$dataLocation[i]) %>%  
        dplyr::select(.data$outcomeId) %>%  
        dplyr::pull()
    )
  }
  return(result)
}

getNames <- function(
  cohortDefinitions, 
  ids
){
  
  idNames <- lapply(cohortDefinitions, function(x) c(x$id, x$name))
  idNames <- do.call(rbind, idNames)
  colnames(idNames) <- c('id', 'name')
  idNames <- as.data.frame(idNames)
  
  nams <- c()
  for(id in ids){
    nams <- c(nams, idNames$name[idNames$id == id])
  }
  
  return(nams)
  
}
quinterpriest/PatientLevelPrediction documentation built on April 20, 2022, 12:50 a.m.