plp-git-demo: Package for patient level prediction using data in the OMOP Common Data Model

Documented in predictAndromeda predictPlp predictProbabilities

# @file predict.R
#
# Copyright 2020 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' predictPlp
#'
#' @description
#' Predict the risk of the outcome using the input plpModel for the input plpData
#' @details
#' The function applied the trained model on the plpData to make predictions
#' @param plpModel                         An object of type \code{plpModel} - a patient level prediction model
#' @param population                       The population created using createStudyPopulation() who will have their risks predicted
#' @param plpData                          An object of type \code{plpData} - the patient level prediction
#'                                         data extracted from the CDM.
#' @param index                            A data frame containing rowId: a vector of rowids and index: a vector of doubles the same length as the rowIds. If used, only the rowIds with a negative index value are used to calculate the prediction.  
#' 
#' @return
#' A dataframe containing the prediction for each person in the population with an attribute metaData containing prediction details.
#'

# parent predict that calls the others
#' @export
predictPlp <- function(plpModel, population, plpData,  index=NULL){
  
  if(is.null(plpModel))
    stop('No model input')
  if(is.null(population))
    stop('No population input')
  if(is.null(plpData))
    stop('No plpData input')
  
  # check logger
  if(length(ParallelLogger::getLoggers())==0){
    logger <- ParallelLogger::createLogger(name = "SIMPLE",
                                           threshold = "INFO",
                                           appenders = list(ParallelLogger::createConsoleAppender(layout = ParallelLogger::layoutTimestamp)))
    ParallelLogger::registerLogger(logger)
  }
  
  # apply the feature transformations
  if(!is.null(index)){
    ParallelLogger::logTrace(paste0('Calculating prediction for ',sum(index$index<0),' in test set'))
    ind <- population$rowId%in%index$rowId[index$index<0]
  } else{
    ParallelLogger::logTrace(paste0('Calculating prediction for ',nrow(population),' in dataset'))
    ind <- rep(T, nrow(population))
  }
  
  # do the predction on the new data
  if(class(plpModel)=='plpModel'){
    # extract the classifier type
    prediction <- plpModel$predict(plpData=plpData,population=population[ind,])
    
    if(nrow(prediction)!=nrow(population[ind,]))
      ParallelLogger::logWarn(paste0('Dimension mismatch between prediction and population test cases.  Population test: ',nrow(population[ind, ]), '-- Prediction:', nrow(prediction) ))
  } else{
    ParallelLogger::logError('Non plpModel input')
    stop()
  }
  
  metaData <- list(predictionType="binary",
                   cohortId = attr(population,'metaData')$cohortId,
                   outcomeId = attr(population,'metaData')$outcomeId)
  
  attr(prediction, "metaData") <- metaData
  return(prediction)
}

# default patient level prediction prediction  
predict.plp <- function(plpModel,population, plpData, ...){
  covariateData <- limitCovariatesToPopulation(plpData$covariateData, population$rowId)
  ParallelLogger::logTrace('predict.plp - predictingProbabilities start')
  prediction <- predictProbabilities(plpModel$model, population, covariateData)
  ParallelLogger::logTrace('predict.plp - predictingProbabilities end')
  return(prediction)
}

# for gxboost
predict.xgboost <- function(plpModel,population, plpData, ...){ 
  result <- toSparseM(plpData, population, map=plpModel$covariateMap)
  data <- result$data[population$rowId,]
  prediction <- data.frame(rowId=population$rowId,
                           value=stats::predict(plpModel$model, data)
  )
  
  prediction <- merge(population, prediction, by='rowId', all.x=T, fill=0)
  prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value')] # need to fix no index issue
  attr(prediction, "metaData") <- list(predictionType = "binary") 
  return(prediction)
  
}

predict.pythonReticulate <- function(plpModel, population, plpData){
  
  e <- environment()
  reticulate::source_python(system.file(package='PatientLevelPrediction','python','predictFunctions.py'), envir = e)
  
  
  ParallelLogger::logInfo('Mapping covariates...')
  if(!is.null(plpData$timeRef)){
    pdata <- toSparseTorchPython(plpData,population, map=plpModel$covariateMap, temporal=T)
    pdata <- pdata$data
    fun_predict <- python_predict_temporal
  } else {  
    newData <- toSparseM(plpData, population, map=plpModel$covariateMap)
    included <- plpModel$varImp$covariateId[plpModel$varImp$included>0] # does this include map?
    included <- newData$map$newCovariateId[newData$map$oldCovariateId%in%included] 
    pdata <- reticulate::r_to_py(newData$data[,included])
    fun_predict <- python_predict
  }
  
  # save population
  if('indexes'%in%colnames(population)){
    population$rowIdPython <- population$rowId-1 # -1 to account for python/r index difference
    pPopulation <- as.matrix(population[,c('rowIdPython','outcomeCount','indexes')])
    
  } else {
    population$rowIdPython <- population$rowId-1 # -1 to account for python/r index difference
    pPopulation <- as.matrix(population[,c('rowIdPython','outcomeCount')])
  }
  
  # run the python predict code:
  ParallelLogger::logInfo('Executing prediction...')
  result <- fun_predict(population = pPopulation, 
                        plpData = pdata, 
                        model_loc = plpModel$model,
                        dense = ifelse(is.null(plpModel$dense),0,plpModel$dense),
                        autoencoder = F)
  
  #get the prediction from python and reformat:
  ParallelLogger::logInfo('Returning results...')
  prediction <- result
  prediction <- as.data.frame(prediction)
  attr(prediction, "metaData") <- list(predictionType="binary")
  if(ncol(prediction)==4){
    colnames(prediction) <- c('rowId','outcomeCount','indexes', 'value')
  } else {
    colnames(prediction) <- c('rowId','outcomeCount', 'value')
  }
  
  # add 1 to rowId from python:
  prediction$rowId <- prediction$rowId+1
  
  # add subjectId and date:
  prediction <- merge(prediction,
                      population[,c('rowId','subjectId','cohortStartDate')], 
                      by='rowId')
  return(prediction)
}

predict.pythonAuto <- function(plpModel, population, plpData){
  
  ParallelLogger::logInfo('Mapping covariates...')
  if(!is.null(plpData$timeRef)){
    pdata <- toSparseTorchPython(plpData,population, map=plpModel$covariateMap, temporal=T)
    pdata <- pdata$data
  } else {  
    newData <- toSparseM(plpData, population, map=plpModel$covariateMap)
    included <- plpModel$varImp$covariateId[plpModel$varImp$included>0] # does this include map?
    included <- newData$map$newCovariateId[newData$map$oldCovariateId%in%included] 
    pdata <- reticulate::r_to_py(newData$data[,included])
  }
  
  # save population
  if('indexes'%in%colnames(population)){
    population$rowIdPython <- population$rowId-1 # -1 to account for python/r index difference
    pPopulation <- as.matrix(population[,c('rowIdPython','outcomeCount','indexes')])
    
  } else {
    population$rowIdPython <- population$rowId-1 # -1 to account for python/r index difference
    pPopulation <- as.matrix(population[,c('rowIdPython','outcomeCount')])
  }
  
  # run the python predict code:
  ParallelLogger::logInfo('Executing prediction...')
  e <- environment()
  reticulate::source_python(system.file(package='PatientLevelPrediction','python','predictFunctions.py'), envir = e)
  
  result <- python_predict(population = pPopulation, 
                           plpData = pdata, 
                           model_loc = plpModel$model,
                           dense = ifelse(is.null(plpModel$dense),0,plpModel$dense),
                           autoencoder = T)
  #get the prediction from python and reformat:
  ParallelLogger::logInfo('Returning results...')
  prediction <- result
  prediction <- as.data.frame(prediction)
  attr(prediction, "metaData") <- list(predictionType="binary")
  if(ncol(prediction)==4){
    colnames(prediction) <- c('rowId','outcomeCount','indexes', 'value')
  } else {
    colnames(prediction) <- c('rowId','outcomeCount', 'value')
  }
  
  # add 1 to rowId from python:
  prediction$rowId <- prediction$rowId+1
  
  # add subjectId and date:
  prediction <- merge(prediction,
                      population[,c('rowId','subjectId','cohortStartDate')], 
                      by='rowId')
  return(prediction)
}



predict.knn <- function(plpData, population, plpModel, ...){
  covariateData <- limitCovariatesToPopulation(plpData$covariateData, population$rowId)
  prediction <- BigKnn::predictKnn(covariates = covariateData$covariates,
                                   cohorts= population[,!colnames(population)%in%'cohortStartDate'],
                                   indexFolder = plpModel$model,
                                   k = plpModel$modelSettings$modelParameters$k,
                                   weighted = TRUE,
                                   threads = plpModel$modelSettings$threads)
  
  # can add: threads = 1 in the future
  
  # return the cohorts as a data frame with the prediction added as 
  # a new column with the column name 'value'
  prediction <- merge(population, prediction[,c('rowId','value')], by='rowId', 
                      all.x=T, fill=0)
  prediction$value[is.na(prediction$value)] <- 0
  
  return(prediction)
}


predict.deep <- function(plpModel, population, plpData,   ...){
  temporal <- !is.null(plpData$timeRef)
  ParallelLogger::logDebug(paste0('timeRef null: ',is.null(plpData$timeRef)))
  if(temporal){
    ParallelLogger::logTrace('temporal')
    result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T)
    
    data <-result$data[population$rowId,,]
    if(!is.null(plpModel$useVae)){
      if(plpModel$useVae==TRUE){
        data<- plyr::aaply(as.array(data), 2, function(x) predict(plpModel$vaeEncoder, x, batch_size = plpModel$vaeBatchSize))
        data<-aperm(data, perm = c(2,1,3))#rearrange of dimension
      }}
    
    batch_size <- min(2000, length(population$rowId))
    maxVal <- length(population$rowId)
    batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal))
    prediction <- population
    prediction$value <- 0
    for(batch in batches){
      pred <- keras::predict_on_batch(plpModel$model, as.array(data[batch,,]))
      if(is.null(dim(pred))){
        prediction$value[batch] <- as.double(pred)
      } else{
        prediction$value[batch] <- as.double(pred[,2])
      }
    }
    
    prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value')] # need to fix no index issue
    return(prediction)
  } else{
    result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=F)
    data <-result$data[population$rowId,]
    
    batch_size <- min(2000, length(population$rowId))
    maxVal <- length(population$rowId)
    batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal))
    prediction <- population
    prediction$value <- 0
    for(batch in batches){
      pred <- keras::predict_on_batch(plpModel$model, as.array(data[batch,]))
      prediction$value[batch] <- as.double(pred[,2])
    }
    
    prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value')] # need to fix no index issue
    return(prediction)
    
  }
}

predict.BayesianDeep <- function(plpModel, population, plpData,   ...){
  temporal <- !is.null(plpData$timeRef)
  ParallelLogger::logDebug(paste0('timeRef null: ',is.null(plpData$timeRef)))
  if(temporal){
    ParallelLogger::logTrace('temporal')
    result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T)
    
    data <-result$data[population$rowId,,]
    if(!is.null(plpModel$useVae)){
      if(plpModel$useVae==TRUE){
        data<- plyr::aaply(as.array(data), 2, function(x) predict(plpModel$vaeEncoder, x, batch_size = plpModel$vaeBatchSize))
        data<-aperm(data, perm = c(2,1,3))#rearrange of dimension
      }}
    
    batch_size <- min(2000, length(population$rowId))
    maxVal <- length(population$rowId)
    batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal))
    prediction <- population
    prediction$value <- 0
    prediction$epistemicUncertainty <- 0
    prediction$aleatoricUncertainty <- 0
    for(batch in batches){
      num_MC_samples = 100
      output_dim = 2
      pred <- keras::predict_on_batch(plpModel$model, as.array(data[batch,,]))
      MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim))
      for (k in 1:num_MC_samples){
        MC_samples[k,, ] = predict(plpModel$model, as.array(data[batch,,]))
        #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,]))
      }
      pred <- apply(MC_samples[,,output_dim], 2, mean)
      epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, var)
      logVar = MC_samples[, , output_dim * 2]
      if(length(dim(logVar))<=1){
        aleatoricUncertainty = exp(mean(logVar))
      }else{
        aleatoricUncertainty = exp(colMeans(logVar))
        
      }
      prediction$value[batch] <- pred
      prediction$epistemicUncertainty[batch] = epistemicUncertainty
      prediction$aleatoricUncertainty[batch] = aleatoricUncertainty
      #writeLines(paste0(dim(pred[,2]), collapse='-'))
      #writeLines(paste0(pred[1,2], collapse='-'))
      
    }
    prediction$value[prediction$value>1] <- 1
    prediction$value[prediction$value<0] <- 0
    prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value',
                                                        'epistemicUncertainty', 'aleatoricUncertainty')] # need to fix no index issue
    return(prediction)
  } else{
    result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=F)
    data <-result$data[population$rowId,]
    
    batch_size <- min(2000, length(population$rowId))
    maxVal <- length(population$rowId)
    batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal))
    prediction <- population
    prediction$value <- 0
    for(batch in batches){
      num_MC_samples = 100
      output_dim =2
      MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim))
      for (k in 1:num_MC_samples){
        MC_samples[k,, ] = predict(plpModel$model, as.array(data[batch,,]))
        #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,]))
      }
      pred <- apply(MC_samples[,,output_dim], 2, mean)
      epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, var)
      logVar = MC_samples[, , output_dim * 2]
      if(length(dim(logVar))<=1){
        aleatoricUncertainty = exp(mean(logVar))
      }else{
        aleatoricUncertainty = exp(colMeans(logVar))
        
      }
      prediction$value[batch] <- pred
      prediction$epistemicUncertainty[batch] = epistemicUncertainty
      prediction$aleatoricUncertainty[batch] = aleatoricUncertainty
      #writeLines(paste0(dim(pred[,2]), collapse='-'))
      #writeLines(paste0(pred[1,2], collapse='-'))
    }
    prediction$value[prediction$value>1] <- 1
    prediction$value[prediction$value<0] <- 0
    prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value',
                                                        'epistemicUncertainty', 'aleatoricUncertainty')] # need to fix no index issue
    return(prediction)
    
  }
}

predict.deepEnsemble <- function(plpModel, population, plpData,   ...){
  temporal <- !is.null(plpData$timeRef)
  ParallelLogger::logDebug(paste0('timeRef null: ',is.null(plpData$timeRef)))
  if(temporal){
    ParallelLogger::logTrace('temporal')
    result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T)
    
    data <-result$data[population$rowId,,]
    if(!is.null(plpModel$useVae)){
      if(plpModel$useVae==TRUE){
        data<- plyr::aaply(as.array(data), 2, function(x) predict(plpModel$vaeEncoder, x, batch_size = plpModel$vaeBatchSize))
        data<-aperm(data, perm = c(2,1,3))#rearrange of dimension
      }}
    
    batch_size <- min(2000, length(population$rowId))
    maxVal <- length(population$rowId)
    batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal))
    prediction <- population
    prediction$value <- 0
    
    prediction$sigmas <- 0
    for(batch in batches){
      for (i in seq(plpModel$modelSettings$modelParameters$numberOfEnsembleNetwork)){
        if(i==1){
          muMatrix <- data.frame()
          sigmaMatrix <-data.frame()
        }
        c(mu,sigma) %<-% plpModel$model[[i]](inputs=list(as.array(data[batch,,])))
        muMatrix<-rbind(muMatrix,t(as.data.frame(mu[,2])))
        sigmaMatrix<-rbind(sigmaMatrix,t(as.data.frame(sigma[,2])))
      }
      
      muMean <- apply(muMatrix,2,mean)
      muSq <- muMatrix^2
      sigmaSq <- sigmaMatrix^2
      sigmaMean <- apply(sigmaMatrix,2,mean)
      sigmaResult=apply(muSq+sigmaSq,2, mean)- muMean^2
      
      prediction$value[batch] <- c(muMean)
      prediction$sigmas[batch] <- c(sigmaResult)
    }
    prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value', 'sigmas')] # need to fix no index issue
    #If the prediction value is negative, please add values to make all the values to positive. 
    if(min(prediction$value)<0){prediction$value = prediction$value+ (min(prediction$value)* (-1))  }
    return(prediction)
    
  } else{
    result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=F)
    data <-result$data[population$rowId,]
    
    batch_size <- min(2000, length(population$rowId))
    maxVal <- length(population$rowId)
    batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal))
    prediction <- population
    prediction$value <- 0
    
    prediction$sigmas <- 0
    for(batch in batches){
      for (i in seq(plpModel$modelSettings$modelParameters$numberOfEnsembleNetwork)){
        if(i==1){
          muMatrix <- data.frame()
          sigmaMatrix <-data.frame()
        }
        c(mu,sigma) %<-% plpModel$model[[i]](inputs=list(as.array(data[batch,,])))
        muMatrix<-rbind(muMatrix,t(as.data.frame(mu[,2])))
        sigmaMatrix<-rbind(sigmaMatrix,t(as.data.frame(sigma[,2])))
      }
      
      muMean <- apply(muMatrix,2,mean)
      muSq <- muMatrix^2
      sigmaSq <- sigmaMatrix^2
      sigmaMean <- apply(sigmaMatrix,2,mean)
      sigmaResult=apply(muSq+sigmaSq,2, mean)- muMean^2
      
      prediction$value[batch] <- c(muMean)
      prediction$sigmas[batch] <- c(sigmaResult)
    }
    
    prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value', 'sigmas')] # need to fix no index issue
    if(min(prediction$value)<0){prediction$value = prediction$value+ (min(prediction$value)* (-1))  }
    return(prediction)
    
  }
}

predict.deepMulti <- function(plpModel, population, plpData,   ...){
  
  repeats <- attr(plpModel, 'inputs')
  
  temporal <- !is.null(plpData$timeRef)
  ParallelLogger::logDebug('timeRef null: ',paste0(is.null(plpData$timeRef)))
  if(temporal){
    ParallelLogger::logTrace('temporal')
    result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T)
    ParallelLogger::logDebug('result$data dim: ',paste0(dim(result$data), collapse = '-'))
    ParallelLogger::logDebug('population dim: ',paste0(dim(population), collapse = '-'))
    data <-result$data[population$rowId,,]
    
    ParallelLogger::logInfo('running batch prediction')
    batch_size <- min(2000, length(population$rowId))
    maxVal <- length(population$rowId)
    batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal))
    prediction <- population
    prediction$value <- 0
    for(batch in batches){
      ParallelLogger::logDebug('batch length: ', length(batch))
      dat <- list()
      length(dat) <- repeats
      for( i in 1:repeats) {dat[[i]] <- as.array(data[batch,,])}
      
      pred <- keras::predict_on_batch(plpModel$model, dat)
      if(is.null(dim(pred))){
        ParallelLogger::logDebug('Pred length: ', length(pred))
        prediction$value[batch] <- as.double(pred)
      } else{
        ParallelLogger::logDebug('Pred dim: ', paste0(dim(pred), collapse = '-'))
        prediction$value[batch] <- as.double(pred[,2])
      }
    }
    
    prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value')] # need to fix no index issue
    return(prediction)
  } else{
    result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=F)
    data <-result$data[population$rowId,]
    
    batch_size <- min(2000, length(population$rowId))
    maxVal <- length(population$rowId)
    batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal))
    prediction <- population
    prediction$value <- 0
    for(batch in batches){
      dat <- list()
      length(dat) <- repeats
      for( i in 1:repeats) {dat[[i]] <- as.array(data[batch,,])}
      
      pred <- keras::predict_on_batch(plpModel$model, dat)
      prediction$value[batch] <- as.double(pred)
    }
    
    prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value')] # need to fix no index issue
    return(prediction)
    
  }
}


#' Create predictive probabilities
#'
#' @details
#' Generates predictions for the population specified in plpData given the model.
#'
#' @return
#' The value column in the result data.frame is: logistic: probabilities of the outcome, poisson:
#' Poisson rate (per day) of the outome, survival: hazard rate (per day) of the outcome.
#'
#' @param predictiveModel   An object of type \code{predictiveModel} as generated using
#'                          \code{\link{fitPlp}}.
#' @param population        The population to calculate the prediction for
#' @param covariates        The covariate part of PlpData containing the covariates for the population
#' @export
predictProbabilities <- function(predictiveModel, population, covariateData) {
  start <- Sys.time()
  
  ParallelLogger::logTrace('predictProbabilities - predictAndromeda start')
  prediction <- predictAndromeda(predictiveModel$coefficients,
                            population,
                            covariateData,
                            predictiveModel$modelType)
  ParallelLogger::logTrace('predictProbabilities - predictAndromeda end')
  prediction$time <- NULL
  attr(prediction, "modelType") <- predictiveModel$modelType
  attr(prediction, "cohortId") <- attr(population, "metadata")$cohortId
  attr(prediction, "outcomeId") <- attr(population, "metadata")$outcomeId
  
  delta <- Sys.time() - start
  ParallelLogger::logInfo("Prediction took ", signif(delta, 3), " ", attr(delta, "units"))
  return(prediction)
}

#' Generated predictions from a regression model
#'
#' @param coefficients   A names numeric vector where the names are the covariateIds, except for the
#'                       first value which is expected to be the intercept.
#' @param population       A data frame containing the population to do the prediction for
#' @param covariateData     An andromeda object containing the covariateData with predefined columns
#'                       (see below).
#' @param modelType      Current supported types are "logistic", "poisson", "cox" or "survival".
#'
#' @details
#' These columns are expected in the outcome object: \tabular{lll}{ \verb{rowId} \tab(integer) \tab
#' Row ID is used to link multiple covariates (x) to a single outcome (y) \cr \verb{time} \tab(real)
#' \tab For models that use time (e.g. Poisson or Cox regression) this contains time \cr \tab
#' \tab(e.g. number of days) \cr } These columns are expected in the covariates object: \tabular{lll}{
#' \verb{rowId} \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome
#' (y) \cr \verb{covariateId} \tab(integer) \tab A numeric identifier of a covariate \cr
#' \verb{covariateValue} \tab(real) \tab The value of the specified covariate \cr }
#'
#' @export
predictAndromeda <- function(coefficients, population, covariateData, modelType = "logistic") {
  if (!(modelType %in% c("logistic", "poisson", "survival","cox"))) {
    stop(paste("Unknown modelType:", modelType))
  }
  if (!FeatureExtraction::isCovariateData(covariateData)){
    stop("Needs correct covariateData")
  }
  
  intercept <- coefficients[names(coefficients)%in%'(Intercept)']
  if(length(intercept)==0) intercept <- 0
  coefficients <- coefficients[!names(coefficients)%in%'(Intercept)']
  coefficients <- data.frame(beta = as.numeric(coefficients),
                             covariateId = as.numeric(names(coefficients)))
  coefficients <- coefficients[coefficients$beta != 0, ]
  if(sum(coefficients$beta != 0)>0){
    covariateData$coefficients <- coefficients
    
    prediction <- covariateData$covariates %>% 
      dplyr::inner_join(covariateData$coefficients, by= 'covariateId') %>% 
      dplyr::mutate(values = covariateValue*beta) %>%
      dplyr::group_by(rowId) %>%
      dplyr::summarise(value = sum(values, na.rm = TRUE)) %>%
      dplyr::select(rowId, value)
    
    prediction <- as.data.frame(prediction)
    prediction <- merge(population, prediction, by ="rowId", all.x = TRUE, fill = 0)
    prediction$value[is.na(prediction$value)] <- 0
    prediction$value <- prediction$value + intercept
  } else{
    warning('Model had no non-zero coefficients so predicted same for all population...')
    prediction <- population
    prediction$value <- rep(0, nrow(population)) + intercept
  }
  if (modelType == "logistic") {
    link <- function(x) {
      return(1/(1 + exp(0 - x)))
    }
    prediction$value <- link(prediction$value)
  } else if (modelType == "poisson" || modelType == "survival" || modelType == "cox") {
    prediction$value <- exp(prediction$value)
    if(max(prediction$value)>1){
      prediction$value <- prediction$value/max(prediction$value)
    }
  }
  return(prediction)
}
hxia/plp-git-demo documentation built on March 19, 2021, 1:54 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
hxia/plp-git-demo
Package for patient level prediction using data in the OMOP Common Data Model

R/Predict.R
In hxia/plp-git-demo: Package for patient level prediction using data in the OMOP Common Data Model

Defines functions predictAndromeda predictProbabilities predict.deepMulti predict.deepEnsemble predict.BayesianDeep predict.deep predict.knn predict.pythonAuto predict.pythonReticulate predict.xgboost predict.plp predictPlp

Documented in predictAndromeda predictPlp predictProbabilities

R Package Documentation

Browse R Packages

We want your feedback!

hxia/plp-git-demo Package for patient level prediction using data in the OMOP Common Data Model

R/Predict.R In hxia/plp-git-demo: Package for patient level prediction using data in the OMOP Common Data Model

Defines functions predictAndromeda predictProbabilities predict.deepMulti predict.deepEnsemble predict.BayesianDeep predict.deep predict.knn predict.pythonAuto predict.pythonReticulate predict.xgboost predict.plp predictPlp

Documented in predictAndromeda predictPlp predictProbabilities

R Package Documentation

Browse R Packages

We want your feedback!

hxia/plp-git-demo
Package for patient level prediction using data in the OMOP Common Data Model

R/Predict.R
In hxia/plp-git-demo: Package for patient level prediction using data in the OMOP Common Data Model