# @file Fit.R
#
# Copyright 2020 Observational Health Data Sciences and Informatics
#
# This file is part of CohortMethod
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#' fitPlp
#'
#' @description
#' Train various models using a default parameter gird search or user specified parameters
#'
#' @details
#' The user can define the machine learning model to train (regularised logistic regression, random forest,
#' gradient boosting machine, neural network and )
#'
#' @param population The population created using createStudyPopulation() who will have their risks predicted
#' @param data An object of type \code{plpData} - the patient level prediction
#' data extracted from the CDM.
#' @param modelSettings An object of class \code{modelSettings} created using one of the function:
#' \itemize{
#' \item{logisticRegressionModel()}{ A lasso logistic regression model}
#' \item{GBMclassifier()}{ A gradient boosting machine}
#' \item{RFclassifier()}{ A random forest model}
#' \item{GLMclassifier ()}{ A generalised linear model}
#' \item{KNNclassifier()}{ A KNN model}
#' }
#' @param cohortId Id of study cohort
#' @param outcomeId Id of outcome cohort
#' @param minCovariateFraction The minimum fraction of the target popualtion who have a variable for it to be included in the model training
#' @param normalizeData Whether to normalise the data before model fitting
#' @return
#' An object of class \code{plpModel} containing:
#'
#' \item{model}{The trained prediction model}
#' \item{modelLoc}{The path to where the model is saved (if saved)}
#' \item{trainAuc}{The AUC obtained on the training set}
#' \item{trainCalibration}{The calibration obtained on the training set}
#' \item{modelSettings}{A list specifiying the model, preprocessing, outcomeId and cohortId}
#' \item{metaData}{The model meta data}
#' \item{trainingTime}{The time taken to train the classifier}
#'
#'
#' @export
fitPlp <- function(population, data, modelSettings,#featureSettings,
cohortId, outcomeId, minCovariateFraction=0.001, normalizeData=T){
if(is.null(population))
stop('Population is NULL')
if(is.null(data))
stop('plpData is NULL')
if(is.null(modelSettings$model))
stop('No model specified')
plpData <- data
#=========================================================
# run through pipeline list and apply:
#=========================================================
# normalise the data:
removeRedundancy <- ifelse("timeId" %in%colnames(plpData$covariateData$covariates), F, T)
plpData$covariateData <- tryCatch({
suppressWarnings(FeatureExtraction::tidyCovariateData(covariateData=data$covariateData,
minFraction = minCovariateFraction,
normalize = normalizeData,
removeRedundancy = removeRedundancy))
})
# get the pre-processing settings
##preprocessSettings <- attr(newCovariateData, "metaData")
# normFactors, deletedRedundantCovariateIds, deletedInfrequentCovariateIds
# Now apply the classifier:
fun <- modelSettings$model
args <- list(plpData =plpData,param =modelSettings$param,
population=population, cohortId=cohortId, outcomeId=outcomeId)
plpModel <- do.call(fun, args)
ParallelLogger::logTrace('Returned from classifier function')
# add pre-processing details
plpModel$metaData$preprocessSettings <- attr(plpData$covariateData, "metaData")
ParallelLogger::logTrace('Creating prediction function')
plpModel$predict <- createTransform(plpModel)
ParallelLogger::logTrace('Adding index')
plpModel$index <- population$indexes ##?- dont think we need this, just the seed instead
class(plpModel) <- 'plpModel'
return(plpModel)
}
#NEED TO UPDATE....
# fucntion for implementing the pre-processing (normalisation and redundant features removal)
applyTidyCovariateData <- function(covariateData,preprocessSettings){
if (!FeatureExtraction::isCovariateData(covariateData))
stop("Data not of class CovariateData")
newCovariateData <- Andromeda::andromeda(covariateRef = covariateData$covariateRef,
analysisRef = covariateData$analysisRef)
maxs <- preprocessSettings$normFactors
deleteRedundantCovariateIds <- preprocessSettings$deletedRedundantCovariateIds
deletedInfrequentCovariateIds <- preprocessSettings$deletedInfrequentCovariateIds
writeLines("Removing infrequent and redundant covariates and normalizing")
start <- Sys.time()
if('bins'%in%colnames(maxs)){
covariateData$maxes <- tibble::as_tibble(maxs) %>% dplyr::rename(covariateId = bins) %>%
dplyr::rename(maxValue = maxs)
} else{
covariateData$maxes <- maxs #tibble::as_tibble(maxs) %>% dplyr::rename(covariateId = bins)
}
on.exit(covariateData$maxes <- NULL, add = TRUE)
newCovariateData$covariates <- covariateData$covariates %>%
dplyr::filter(!covariateId %in%deletedInfrequentCovariateIds) %>%
dplyr::filter(!covariateId %in%deleteRedundantCovariateIds) %>%
dplyr::inner_join(covariateData$maxes, by = 'covariateId') %>%
dplyr::mutate(value = 1.0*covariateValue/maxValue) %>%
dplyr::select(-covariateValue) %>%
dplyr::rename(covariateValue = value)
delta <- Sys.time() - start
writeLines(paste("Removing infrequent and redundant covariates covariates and normalizing took", signif(delta, 3), attr(delta, "units")))
# return processed data
return(newCovariateData)
}
# create transformation function
createTransform <- function(plpModel){
#=============== edited this in last run
# remove index to save space
plpModel$index <- NULL
##plpModel$varImp <- NULL
# remove connection details for privacy
plpModel$metaData$call$connectionDetails <- NULL
#=====================
transform <- function(plpData=NULL, population=NULL){
#check model fitting makes sense:
if(ifelse(!is.null(attr(population, "metaData")$cohortId),attr(population, "metaData")$cohortId,-1)!=plpModel$cohortId)
warning('cohortId of new data does not match training data')
if(ifelse(!is.null(attr(population, "metaData")$outcomeId),attr(population, "metaData")$outcomeId,-1)!=plpModel$outcomeId)
warning('outcomeId of new data does not match training data or does not exist')
# apply normalsation to new data
plpData$covariateData <- limitCovariatesToPopulation(plpData$covariateData, population$rowId)
if(!is.null(plpModel$metaData$preprocessSettings)){
plpData$covariateData <- applyTidyCovariateData(plpData$covariateData,plpModel$metaData$preprocessSettings)
}
pred <- do.call(paste0('predict.',attr(plpModel, 'type')), list(plpModel=plpModel,
plpData=plpData,
population=population))
metaData <- list(trainDatabase = strsplit(do.call(paste, list(plpModel$metaData$call$cdmDatabaseSchema)),'\\.')[[1]][1],
testDatabase = strsplit(do.call(paste, list(plpData$metaData$call$cdmDatabaseSchema)),'\\.')[[1]][1],
studyStartDate = do.call(paste,list(plpModel$metaData$call$studyStartDate)),
studyEndDate = do.call(paste,list(plpModel$metaData$call$studyEndDate)),
cohortId = plpModel$cohortId,
outcomeId = plpModel$outcomeId,
predictionType ='binary'
)
attr(pred, 'metaData') <- metaData
return(pred)
}
return(transform)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.