# @file EvaluatePlp.R
#
# Copyright 2020 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' evaluatePlp
#'
#' @description
#' Evaluates the performance of the patient level prediction model
#' @details
#' The function calculates various metrics to measure the performance of the model
#' @param prediction                         The patient level prediction model's prediction
#' @param plpData                            The patient level prediction data
#' @return
#' A list containing the performance values
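#' @examples
#' \dontrun{
#' # a hedged sketch: 'prediction' is assumed to be the data.frame produced by
#' # the package's predict functions (including a metaData attribute with
#' # predictionType = 'binary') and 'plpData' the matching plpData object
#' performance <- evaluatePlp(prediction, plpData)
#' performance$evaluationStatistics$AUC
#' }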
#'
#' @export
evaluatePlp <- function(prediction, plpData){

  # check logger
  if(length(ParallelLogger::getLoggers())==0){
    logger <- ParallelLogger::createLogger(name = "SIMPLE",
                                           threshold = "INFO",
                                           appenders = list(ParallelLogger::createConsoleAppender(layout = ParallelLogger::layoutTimestamp)))
    ParallelLogger::registerLogger(logger)
  }

  # checking inputs
  #========================================
  type <- attr(prediction, "metaData")$predictionType
  if (type != "binary") {
    stop('Currently only support binary classification models')
  }

  if(is.null(prediction$outcomeCount)){
    stop('No outcomeCount column present')
  }
  if(length(unique(prediction$value))==1){
    stop('Cannot evaluate as predictions all the same value')
  }
  #============================

  # auc
  ParallelLogger::logTrace('Calculating AUC')
  if(sum(prediction$outcomeCount>0) < 1000){
    auc <- computeAuc(prediction, confidenceInterval = TRUE)
    ParallelLogger::logInfo(sprintf('%-20s%.2f', 'AUC: ', auc$auc*100))
    ParallelLogger::logInfo(sprintf('%-20s%.2f', '95% lower AUC: ', auc$auc_lb95ci*100))
    ParallelLogger::logInfo(sprintf('%-20s%.2f', '95% upper AUC: ', auc$auc_ub95ci*100))
  } else {
    # speed issues with big data so using the AUC package
    auc <- data.frame(auc = AUC::auc(AUC::roc(prediction$value, factor(prediction$outcomeCount))),
                      auc_lb95ci = NA,
                      auc_ub95ci = NA)
    ParallelLogger::logInfo(sprintf('%-20s%.2f', 'AUC: ', auc$auc*100))
  }

  # auprc
  ParallelLogger::logTrace('Calculating AUPRC')
  positive <- prediction$value[prediction$outcomeCount == 1]
  negative <- prediction$value[prediction$outcomeCount == 0]
  pr <- PRROC::pr.curve(scores.class0 = positive, scores.class1 = negative)
  auprc <- pr$auc.integral
  ParallelLogger::logInfo(sprintf('%-20s%.2f', 'AUPRC: ', auprc*100))
  
  # brier score - returns brier and brierScaled
  ParallelLogger::logTrace('Calculating Brier Score')
  brier <- brierScore(prediction)
  ParallelLogger::logInfo(sprintf('%-20s%.2f', 'Brier: ', brier$brier))

  # 2) thresholdSummary
  ParallelLogger::logTrace(paste0('Calculating threshold summary started @ ', Sys.time()))
  thresholdSummary <- getThresholdSummary(prediction)

  ParallelLogger::logTrace(paste0('Completed @ ',Sys.time()))
  
  # 3) demographicSummary
  ParallelLogger::logTrace(paste0('Calculating Demographic Based Evaluation Started @ ', Sys.time()))
  demographicSummary <- tryCatch(getDemographicSummary(prediction, plpData),
                                 error = function(cond){return(NULL)})
  ParallelLogger::logTrace(paste0('Completed @ ',Sys.time()))


  # calibration-in-the-large: compares mean predicted risk with observed risk
  ParallelLogger::logTrace('Calculating Calibration-in-large')
  calinlarge <- calibrationInLarge(prediction)
  ParallelLogger::logInfo(paste0('Calibration in large - Mean predicted risk ', round(calinlarge$meanPredictionRisk, digits = 4), ' : observed risk ', round(calinlarge$observedRisk, digits = 4)))
  
  ParallelLogger::logTrace('Calculating Weak Calibration')
  weakCal <- calibrationWeak(prediction)
  ParallelLogger::logInfo(paste0('Weak calibration intercept: ',
                                 round(weakCal$intercept, digits = 4),
                                 ' - gradient: ', round(weakCal$gradient, digits = 4)))
  
  ParallelLogger::logTrace('Calculating Hosmer-Lemeshow Calibration Line')
  calLine10 <- calibrationLine(prediction, numberOfStrata = 10)
  ParallelLogger::logInfo(sprintf('%-20s%.2f%-20s%.2f', 'Hosmer-Lemeshow calibration gradient: ', calLine10$lm[2], ' intercept: ',calLine10$lm[1]))
  # 4) calibrationSummary
  ParallelLogger::logTrace(paste0('Calculating Calibration Summary Started @ ',Sys.time()))
  calibrationSummary <- getCalibration(prediction,
                                       numberOfStrata = 100,
                                       truncateFraction = 0.01)
  ParallelLogger::logTrace(paste0('Completed @ ',Sys.time()))

  # 5) predictionDistribution - done
  ParallelLogger::logTrace(paste0('Calculating Quantiles Started @ ',Sys.time()))
  predictionDistribution <- getPredictionDistribution(prediction)
  ParallelLogger::logTrace(paste0('Completed @ ',Sys.time()))

  # Extra: Average Precision
  aveP.val <- averagePrecision(prediction)
  ParallelLogger::logInfo(sprintf('%-20s%.2f', 'Average Precision: ', aveP.val))

  # evaluationStatistics:
  # analysisId is expected in the prediction metaData
  evaluationStatistics <- list(analysisId = attr(prediction, "metaData")$analysisId,
                               populationSize = nrow(prediction),
                               outcomeCount = sum(prediction$outcomeCount),
                               AUC = auc,
                               AUPRC = auprc,
                               BrierScore = brier$brier,
                               BrierScaled = brier$brierScaled,
                               CalibrationIntercept = weakCal$intercept,
                               CalibrationSlope = weakCal$gradient,
                               CalibrationInLarge = calinlarge$meanPredictionRisk/calinlarge$observedRisk)

  result <- list(evaluationStatistics = evaluationStatistics,
                 thresholdSummary = thresholdSummary,
                 demographicSummary = demographicSummary,
                 calibrationSummary = calibrationSummary,
                 predictionDistribution = predictionDistribution)
  class(result) <- 'plpEvaluation'
  return(result)

}

#' Compute the area under the ROC curve
#'
#' @details
#' Computes the area under the ROC curve for the predicted probabilities, given the true observed
#' outcomes.
#'
#' @param prediction            A prediction object as generated using the
#'                              \code{\link{predict}} functions.
#' @param confidenceInterval    Should 95 percent confidence intervals be computed?
#'
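#' @examples
#' # minimal sketch: a synthetic prediction object mimicking the structure the
#' # predict functions are assumed to produce (value = predicted risk,
#' # outcomeCount = observed 0/1 outcome, metaData attribute set manually)
#' prediction <- data.frame(rowId = 1:100,
#'                          value = runif(100),
#'                          outcomeCount = rbinom(100, 1, 0.3))
#' attr(prediction, "metaData") <- list(predictionType = "binary")
#' computeAuc(prediction, confidenceInterval = FALSE)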
#' @export
computeAuc <- function(prediction,
                       confidenceInterval = FALSE) {
  if (attr(prediction, "metaData")$predictionType != "binary")
    stop("Computing AUC is only implemented for binary classification models")

  if (confidenceInterval) {
    auc <- aucWithCi(prediction$value, prediction$outcomeCount)
    return(data.frame(auc = auc[1], auc_lb95ci = auc[2], auc_ub95ci = auc[3]))
  } else {
    auc <- aucWithoutCi(prediction$value, prediction$outcomeCount)
    return(auc)
  }
}

#' Compute the area under the ROC curve
#'
#' @details
#' Computes the area under the ROC curve for the predicted probabilities, given the true observed
#' outcomes.
#'
#' @param prediction           A vector with the predicted risk (a probability for logistic models, a hazard-based score for survival models).
#' @param status               A vector with the status of 1 (event) or 0 (no event).
#' @param time                 Only for survival models: a vector with the time to event or censor
#'                             (which ever comes first).
#' @param confidenceInterval   Should 95 percent confidence intervals be computed?
#' @param timePoint            Only for survival models: time point when the AUC should be evaluated
#' @param modelType            Type of model. Currently supported are "logistic" and "survival".
#'
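#' @examples
#' # hedged sketch using synthetic data: score and 0/1 status vectors with the
#' # default logistic model type
#' computeAucFromDataFrames(prediction = runif(100),
#'                          status = rbinom(100, 1, 0.5))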
#' @export
computeAucFromDataFrames <- function(prediction,
                                     status,
                                     time = NULL,
                                     confidenceInterval = FALSE,
                                     timePoint,
                                     modelType = "logistic") {
  if (modelType == "survival" && confidenceInterval)
    stop("Currently not supporting confidence intervals for survival models")

  if (modelType == "survival") {
    Surv.rsp <- survival::Surv(time, status)
    Surv.rsp.new <- Surv.rsp
    if (missing(timePoint))
      timePoint <- max(time[status == 1])
    auc <- survAUC::AUC.uno(Surv.rsp, Surv.rsp.new, prediction, timePoint)$auc
    return(auc)
  } else {
    if (confidenceInterval) {
      auc <- aucWithCi(prediction, status)
      return(data.frame(auc = auc[1], auc_lb95ci = auc[2], auc_ub95ci = auc[3]))
    } else {
      auc <- aucWithoutCi(prediction, status)
      return(auc)
    }
  }
}

#' brierScore
#'
#' @details
#' Calculates the brierScore from prediction object
#'
#' @param prediction            A prediction object as generated using the
#'                              \code{\link{predictProbabilities}} function.
#'
#' @return
#' A list containing the brier score and the scaled brier score
#'
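#' @examples
#' # hedged toy example: the Brier score is the mean squared difference
#' # between the predicted risk and the observed 0/1 outcome
#' prediction <- data.frame(value = c(0.1, 0.9, 0.8),
#'                          outcomeCount = c(0, 1, 1))
#' brierScore(prediction)  # brier = (0.1^2 + 0.1^2 + 0.2^2)/3 = 0.02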
#' @export
brierScore <- function(prediction){

  brier <- sum((prediction$outcomeCount - prediction$value)^2)/nrow(prediction)
  brierMax <- mean(prediction$value)*(1 - mean(prediction$value))
  brierScaled <- 1 - brier/brierMax
  return(list(brier = brier, brierScaled = brierScaled))
}

#' calibrationLine
#'
#' @param prediction            A prediction object as generated using the
#'                              \code{\link{predictProbabilities}} function.
#' @param numberOfStrata        The number of groups to split the prediction into
#'
#' @details
#' Calculates the calibration from prediction object
#'
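#' @examples
#' # hedged sketch with synthetic, perfectly calibrated predictions; the fitted
#' # line should then have an intercept near 0 and a gradient near 1
#' p <- runif(1000)
#' prediction <- data.frame(value = p,
#'                          outcomeCount = rbinom(1000, 1, p))
#' calibrationLine(prediction, numberOfStrata = 10)$lm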
#' @export
calibrationLine <- function(prediction,numberOfStrata=10){

  q <- unique(stats::quantile(prediction$value, c((1:(numberOfStrata - 1))/numberOfStrata, 1)))

  if(length(unique(c(0,q)))==2){
    warning('Prediction not spread')
    prediction$strata <- cut(prediction$value,
                             breaks = c(-0.1,0.5,1),
                             labels = FALSE)
  } else {
    prediction$strata <- cut(prediction$value,
                             breaks = unique(c(-0.1,q)),
                             labels = FALSE)
  }

  # get observed events:
  obs.Points <- stats::aggregate(prediction$outcomeCount, by=list(prediction$strata), FUN=mean)
  colnames(obs.Points) <- c('group','obs')
  pred.Points <- stats::aggregate(prediction$value, by=list(prediction$strata), FUN=mean)
  colnames(pred.Points) <- c('group','pred')

  # hosmer-lemeshow-goodness-of-fit-test
  obs.count <- stats::aggregate(prediction$outcomeCount, by=list(prediction$strata), FUN=sum)
  colnames(obs.count) <- c('group','observed')
  expected.count <- stats::aggregate(prediction$value, by=list(prediction$strata), FUN=sum)
  colnames(expected.count) <- c('group','expected')
  hoslem <- merge(obs.count, expected.count, by='group')
  obs.count2 <- stats::aggregate(1-prediction$outcomeCount, by=list(prediction$strata), FUN=sum)
  colnames(obs.count2) <- c('group','observed')
  expected.count2 <- stats::aggregate(1-prediction$value, by=list(prediction$strata), FUN=sum)
  colnames(expected.count2) <- c('group','expected')
  nhoslem <- merge(obs.count2, expected.count2, by='group')
  Xsquared <- sum((hoslem$observed-hoslem$expected)^2/hoslem$expected) +
    sum((nhoslem$observed-nhoslem$expected)^2/nhoslem$expected)
  pvalue <- stats::pchisq(Xsquared, df=numberOfStrata-2, lower.tail = FALSE)
  hosmerlemeshow <- data.frame(Xsquared=Xsquared, df=numberOfStrata-2, pvalue=pvalue)

  # linear model fitting obs to pred:
  lmData <- merge(obs.Points, pred.Points, by='group')
  model <- stats::lm(obs ~ pred, data = lmData)
  res <- model$coefficients
  names(res) <- c('Intercept','Gradient')

  result <- list(lm=res,
                 aggregateLmData = lmData,
                 hosmerlemeshow = hosmerlemeshow)
  return(result)
}

#' Calculate the average precision
#'
#' @details
#' Calculates the average precision from a prediction object
#'
#' @param prediction            A prediction object as generated using the
#'                              \code{\link{predictProbabilities}} function.
#'
#' @return
#' The average precision
#'
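#' @examples
#' # hedged toy example: a perfect ranking (all outcomes ranked above all
#' # non-outcomes) gives an average precision of 1
#' prediction <- data.frame(value = c(0.9, 0.8, 0.2, 0.1),
#'                          outcomeCount = c(1, 1, 0, 0))
#' averagePrecision(prediction)  # 1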
#' @export
averagePrecision <- function(prediction){
  lab.order <- prediction$outcomeCount[order(-prediction$value)]
  n <- nrow(prediction)
  P <- sum(prediction$outcomeCount>0)
  val <- rep(0, n)
  val[lab.order>0] <- 1:P
  return(sum(val/(1:n))/P)
}



calibrationInLarge <- function(prediction){
  
  result <- data.frame(meanPredictionRisk = mean(prediction$value),
                       observedRisk = sum(prediction$outcomeCount)/nrow(prediction),
                       N = nrow(prediction)
  )
  
  return(result)
}


calibrationWeak <- function(prediction){
  
  # logit transform of the predicted probabilities: log(p/(1-p))
  # nudge probabilities of exactly 0 or 1 to avoid infinite logits
  prediction$value[prediction$value==0] <- 0.000000000000001
  prediction$value[prediction$value==1] <- 1-0.000000000000001
  
  inverseLog <- log(prediction$value/(1-prediction$value))
  y <- ifelse(prediction$outcomeCount>0, 1, 0) 
  
  intercept <- suppressWarnings(stats::glm(y ~ offset(1*inverseLog), family = 'binomial'))
  intercept <- intercept$coefficients[1]
  gradient <- suppressWarnings(stats::glm(y ~ inverseLog+0, family = 'binomial',
                         offset = rep(intercept,length(inverseLog))))
  gradient <- gradient$coefficients[1]
  
  result <- data.frame(intercept = intercept, gradient = gradient)
  
  return(result)
}
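# Example (hedged sketch): for a well-calibrated model the recalibration fit on
# the logit scale should give an intercept near 0 and a gradient near 1:
#   p <- runif(10000)
#   prediction <- data.frame(value = p, outcomeCount = rbinom(10000, 1, p))
#   calibrationWeak(prediction)
#   calibrationInLarge(prediction)  # mean predicted risk ~ observed risk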

#' Get a sparse summary of the calibration
#'
#' @details
#' Generates a sparse summary showing the predicted probabilities and the observed fractions. Predictions are
#' stratified into equally sized bins of predicted probabilities.
#'
#' @param prediction            A prediction object as generated using the
#'                              \code{\link{predict}} functions.
#' @param numberOfStrata        The number of strata in the plot.
#' @param truncateFraction      This fraction of probability values will be ignored when plotting, to
#'                              avoid the x-axis scale being dominated by a few outliers.
#'
#' @return
#' A dataframe with the calibration summary
#'
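#' @examples
#' # minimal sketch with synthetic data; the metaData attribute is assumed to
#' # be set by the predict functions, so it is added manually here
#' prediction <- data.frame(rowId = 1:1000,
#'                          value = runif(1000),
#'                          outcomeCount = rbinom(1000, 1, 0.3))
#' attr(prediction, "metaData") <- list(predictionType = "binary")
#' getCalibration(prediction, numberOfStrata = 10, truncateFraction = 0.01)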
#' @export
getCalibration <- function(prediction,
                           numberOfStrata = 10,
                           truncateFraction = 0.01) {
  if (attr(prediction, "metaData")$predictionType != "binary")
    stop("Computing the calibration is only implemented for binary classification models")

  q <- unique(stats::quantile(prediction$value, (1:(numberOfStrata - 1))/numberOfStrata))
  prediction$predictionThresholdId <- cut(prediction$value,
                                          breaks = unique(c(-0.00001, q, max(prediction$value))),
                                          labels = FALSE)

  prediction <- merge(prediction,
                      data.frame(predictionThresholdId=1:(length(q)+1), predictionThreshold=c(0, q)),
                      by='predictionThresholdId', all.x=T)

  computeStratumStats <- function(data) {
    return(data.frame(minx = min(data$value),
                      maxx = max(data$value),
                      fraction = sum(data$outcomeCount)/nrow(data)))
  }

  # count the number of persons in each prediction stratum
  PersonCountAtRisk <- stats::aggregate(rowId ~ predictionThreshold, data = prediction, length)
  names(PersonCountAtRisk)[2] <- "PersonCountAtRisk"

  # count the number of persons in each stratum who have the outcome during the time-at-risk
  PersonCountWithOutcome <- stats::aggregate(outcomeCount ~ predictionThreshold, data = prediction, sum)
  names(PersonCountWithOutcome)[2] <- "PersonCountWithOutcome"

  strataData <- merge(PersonCountAtRisk,
                      PersonCountWithOutcome)

  # Select all persons within the predictionThreshold, compute their average predicted probability
  averagePredictedProbability <- stats::aggregate(prediction$value, list(prediction$predictionThreshold),
                                                  mean)
  colnames(averagePredictedProbability) <- c('predictionThreshold', 'averagePredictedProbability')
  strataData <- merge(strataData,averagePredictedProbability)

  StDevPredictedProbability <- stats::aggregate(prediction$value, list(prediction$predictionThreshold),
                                                sd)
  colnames(StDevPredictedProbability) <- c('predictionThreshold', 'StDevPredictedProbability')
  strataData <- merge(strataData, StDevPredictedProbability)

  # MinPredictedProbability, P25PredictedProbability, MedianPredictedProbability,
  # P75PredictedProbability, MaxPredictedProbability
  quantiles <- stats::aggregate(prediction$value, list(prediction$predictionThreshold),
                                function(x) stats::quantile(x, probs = c(0,0.25,0.5,0.75,1))
  )
  quantiles <- as.matrix(quantiles)
  colnames(quantiles) <- c('predictionThreshold', 'MinPredictedProbability',
                           'P25PredictedProbability', 'MedianPredictedProbability',
                           'P75PredictedProbability', 'MaxPredictedProbability')
  strataData <- merge(strataData, quantiles)

  # proportion of persons in each stratum with the outcome during the time-at-risk (PersonCountWithOutcome / PersonCountAtRisk)
  # TODO: need to make sure PersonCountAtRisk is not zero - how to handle zeros?
  strataData$observedIncidence <- strataData$PersonCountWithOutcome/strataData$PersonCountAtRisk

  attr(strataData,'lims') <- stats::quantile(prediction$value, c(truncateFraction, 1 - truncateFraction))

  return(strataData)
}


#' Calculate all measures for sparse ROC
#'
#' @details
#' Calculates the TP, FP, TN, FN, TPR, FPR, accuracy, PPV, FOR and F1 measure
#' from a prediction object
#'
#' @param prediction            A prediction object as generated using the
#'                              \code{\link{predictProbabilities}} function.
#'
#' @return
#' A data.frame with all the measures
#'
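#' @examples
#' # hedged sketch with synthetic predictions; returns one row per threshold
#' prediction <- data.frame(value = runif(1000),
#'                          outcomeCount = rbinom(1000, 1, 0.3))
#' ts <- getThresholdSummary(prediction)
#' head(ts[, c('predictionThreshold', 'sensitivity', 'specificity',
#'             'positivePredictiveValue')])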
#' @export
getThresholdSummary <- function(prediction){

  # do input checks
  if(nrow(prediction)<1){
    warning('sparse roc not calculated due to empty dataset')
    return(NULL)
  }

  n <- nrow(prediction)
  P <- sum(prediction$outcomeCount>0)
  N <- n - P

  if(N==0){
    warning('Number of negatives is zero')
    return(NULL)
  }
  if(P==0){
    warning('Number of positives is zero')
    return(NULL)
  }

  # add the preference score:
  proportion <- sum(prediction$outcomeCount)/nrow(prediction)
  # cap predictions of exactly 1 to avoid an infinite logit when computing the preference score
  prediction$value[prediction$value==1] <- 0.99999999
  x <- exp(log(prediction$value/(1 - prediction$value)) - log(proportion/(1 - proportion)))
  prediction$preferenceScore <- x/(x + 1)

  # get 100 points of distribution:
  # get the predictionThreshold and preferenceThreshold
  # add top 100 risk as well (for high ppv)
  predictionThreshold <- sort(c(prediction$value[order(-prediction$value)][1:100], 
                              stats::quantile(prediction$value, probs=seq(0,0.99,0.01),na.rm=TRUE)))
  preferenceThreshold <- sort(c(prediction$preferenceScore[order(-prediction$preferenceScore)][1:100],
                              stats::quantile(prediction$preferenceScore, seq(0,0.99,0.01),na.rm=TRUE)))
  
  # work around quantile() returning near-duplicate values when there are not enough unique predictions
  for (val in names(table(predictionThreshold))[table(predictionThreshold)>1])
    predictionThreshold[predictionThreshold==val] <- as.double(val)

  # get the outcomeCount ordered by predicted value
  lab.order <- prediction$outcomeCount[order(-prediction$value)]
  valueOrdered <- prediction$value[order(-prediction$value)]
  # round to avoid floating-point comparison issues
  valueOrdered <- round(valueOrdered, digits = 10)
  predictionThreshold <- round(predictionThreshold, digits = 10)
  # get the indexes for the predictionThreshold
  indexesOfInt <- sapply(predictionThreshold, function(x) min(which(valueOrdered<=x)))

  # min(which(...)) returns Inf when no prediction is at or below a threshold;
  # impute those entries from the closest preceding finite index
  if(sum(is.infinite(indexesOfInt))>0){
    imputeInd <- sapply(which(is.infinite(indexesOfInt)), function(x) max(which(which(!is.infinite(indexesOfInt))<x)))
    indexesOfInt[which(is.infinite(indexesOfInt))] <- indexesOfInt[imputeInd]
  }

  # for speed: temp.cumsum[i] holds the number of negatives among the i highest-ranked predictions
  temp.cumsum <- rep(0, length(lab.order))
  temp.cumsum[lab.order==0] <- 1:sum(lab.order==0)
  temp.cumsum[lab.order==1] <- which(lab.order==1) - (1:sum(lab.order==1))
  TP <- sapply(indexesOfInt, function(x) x-temp.cumsum[x])
  FP <- sapply(indexesOfInt, function(x) temp.cumsum[x])

  TN <- N-FP
  FN <- P-TP

  positiveCount <- TP+FP
  negativeCount <- TN+FN
  trueCount <- TP+FN
  falseCount <- TN + FP
  truePositiveCount <- TP
  trueNegativeCount <- TN
  falsePositiveCount <- FP
  falseNegativeCount <- FN

  f1Score <- f1Score(TP,TN,FN,FP)
  accuracy <- accuracy(TP,TN,FN,FP)
  sensitivity <- sensitivity(TP,TN,FN,FP)
  falseNegativeRate <- falseNegativeRate(TP,TN,FN,FP)
  falsePositiveRate <- falsePositiveRate(TP,TN,FN,FP)
  specificity <-  specificity(TP,TN,FN,FP)
  positivePredictiveValue <- positivePredictiveValue(TP,TN,FN,FP)
  falseDiscoveryRate <- falseDiscoveryRate(TP,TN,FN,FP)
  negativePredictiveValue <- negativePredictiveValue(TP,TN,FN,FP)
  falseOmissionRate <- falseOmissionRate(TP,TN,FN,FP)
  positiveLikelihoodRatio <- positiveLikelihoodRatio(TP,TN,FN,FP)
  negativeLikelihoodRatio <- negativeLikelihoodRatio(TP,TN,FN,FP)
  diagnosticOddsRatio <- diagnosticOddsRatio(TP,TN,FN,FP)

  return(data.frame(predictionThreshold=predictionThreshold,
                    preferenceThreshold=preferenceThreshold,
                    positiveCount=positiveCount,
                    negativeCount=negativeCount,
                    trueCount=trueCount, falseCount=falseCount,
                    truePositiveCount=truePositiveCount,
                    trueNegativeCount=trueNegativeCount,
                    falsePositiveCount=falsePositiveCount,
                    falseNegativeCount=falseNegativeCount,
                    f1Score=f1Score, accuracy=accuracy,
                    sensitivity=sensitivity,
                    falseNegativeRate=falseNegativeRate,
                    falsePositiveRate=falsePositiveRate,
                    specificity=specificity,
                    positivePredictiveValue=positivePredictiveValue,
                    falseDiscoveryRate=falseDiscoveryRate,
                    negativePredictiveValue=negativePredictiveValue,
                    falseOmissionRate=falseOmissionRate,
                    positiveLikelihoodRatio=positiveLikelihoodRatio,
                    negativeLikelihoodRatio=negativeLikelihoodRatio,
                    diagnosticOddsRatio=diagnosticOddsRatio
  ))

}




#' Calculates the prediction distribution
#'
#' @details
#' Calculates the quantiles from a prediction object
#'
#' @param prediction            A prediction object as generated using the
#'                              \code{\link{predictProbabilities}} function.
#'
#' @return
#' The 0.00, 0.05, 0.25, 0.50, 0.75, 0.95 and 1.00 quantiles of the prediction,
#' plus the mean and standard deviation per class
#'
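#' @examples
#' # hedged sketch: quantiles of the predicted risk for each outcome class
#' prediction <- data.frame(value = runif(500),
#'                          outcomeCount = rbinom(500, 1, 0.2))
#' getPredictionDistribution(prediction)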
#' @export
getPredictionDistribution <- function(prediction){

  # PersonCount - count the number of persons in the class
  predictionDistribution <- stats::aggregate(prediction$value, list(prediction$outcomeCount),
                                             length)
  colnames(predictionDistribution) <- c('class', 'PersonCount')

  # averagePredictedProbability	StDevPredictedProbability
  averagePredictedProbability <- stats::aggregate(prediction$value, list(prediction$outcomeCount),
                                                  mean)
  colnames(averagePredictedProbability) <- c('class', 'averagePredictedProbability')
  StDevPredictedProbability <- stats::aggregate(prediction$value, list(prediction$outcomeCount),
                                                sd)
  colnames(StDevPredictedProbability) <- c('class', 'StDevPredictedProbability')

  predictionDistribution <- merge(predictionDistribution,averagePredictedProbability )
  predictionDistribution <- merge(predictionDistribution,StDevPredictedProbability )

  quantiles <- stats::aggregate(prediction$value, list(prediction$outcomeCount),
                                function(x) stats::quantile(x, probs = c(0.00,0.05, 0.25, 0.5, 0.75,0.95, 1.00))
  )
  quantiles <- as.matrix(quantiles)
  colnames(quantiles) <- c('class', 'MinPredictedProbability', 'P05PredictedProbability',
                           'P25PredictedProbability', 'MedianPredictedProbability',
                           'P75PredictedProbability', 'P95PredictedProbability',
                           'MaxPredictedProbability')
  predictionDistribution <- merge(predictionDistribution,quantiles )

  return(predictionDistribution)
}

getDemographicSummary <- function(prediction, plpData){
  

  demographicData <- plpData$cohorts %>%
    dplyr::mutate(ageId = floor(ageYear/5),
                  ageGroup = paste0('Age group: ', floor(ageYear/5)*5, '-', floor(ageYear/5)*5+4),
                  genId = gender,
                  genGroup = ifelse(gender==8507, 'Male', 'Female')) %>%
    dplyr::select(rowId, ageId, ageGroup, genId, genGroup) %>%
    dplyr::inner_join(prediction[,c('rowId', 'value', 'outcomeCount')], by='rowId') %>%
    dplyr::group_by(ageGroup, genGroup) %>%
    dplyr::summarise(PersonCountAtRisk = length(outcomeCount),
                     PersonCountWithOutcome = sum(outcomeCount),
                     averagePredictedProbability = mean(value, na.rm = TRUE),
                     StDevPredictedProbability = sd(value, na.rm = TRUE),
                     MinPredictedProbability = stats::quantile(value, probs = 0),
                     P25PredictedProbability = stats::quantile(value, probs = 0.25),
                     P50PredictedProbability = stats::quantile(value, probs = 0.50),
                     P75PredictedProbability = stats::quantile(value, probs = 0.75),
                     MaxPredictedProbability = stats::quantile(value, probs = 1))
  
  demographicData <- as.data.frame(demographicData)
  
  return(demographicData)
  
}
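# Example (hedged sketch): plpData$cohorts is assumed to contain rowId, ageYear
# and an OMOP gender concept id (8507 = male, 8532 = female):
#   plpData <- list(cohorts = data.frame(rowId = 1:100,
#                                        ageYear = sample(20:80, 100, replace = TRUE),
#                                        gender = sample(c(8507, 8532), 100, replace = TRUE)))
#   prediction <- data.frame(rowId = 1:100,
#                            value = runif(100),
#                            outcomeCount = rbinom(100, 1, 0.2))
#   getDemographicSummary(prediction, plpData)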



# each measure is implemented as a separate function for easy unit testing
#' Calculate the f1Score
#'
#' @details
#' Calculate the f1Score
#'
#' @param TP                Number of true positives
#' @param TN                Number of true negatives
#' @param FN                Number of false negatives
#' @param FP                Number of false positives
#'
#' @return
#' f1Score value
#'
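#' @examples
#' # hedged worked example: TP = 10, TN = 70, FN = 5, FP = 15 gives
#' # precision = 10/25 = 0.4 and recall = 10/15 ~ 0.667, so F1 = 0.5
#' f1Score(TP = 10, TN = 70, FN = 5, FP = 15)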
#' @export
f1Score <- function(TP,TN,FN,FP){
  if(sum(TP<0)>0) stop('TP < 0')
  if(sum(FP<0)>0) stop('FP < 0')
  if(sum(TN<0)>0) stop('TN < 0')
  if(sum(FN<0)>0) stop('FN < 0')
  if(!is.numeric(TP)) stop('Incorrect TP class')
  if(!is.numeric(FP)) stop('Incorrect FP class')
  if(!is.numeric(TN)) stop('Incorrect TN class')
  if(!is.numeric(FN)) stop('Incorrect FN class')
  return(2*(TP/(TP+FP))*(TP/(TP+FN))/((TP/(TP+FP))+(TP/(TP+FN))))
}
#' Calculate the accuracy
#'
#' @details
#' Calculate the accuracy
#'
#' @param TP                Number of true positives
#' @param TN                Number of true negatives
#' @param FN                Number of false negatives
#' @param FP                Number of false positives
#'
#' @return
#' accuracy value
#'
#' @export
accuracy <- function(TP,TN,FN,FP){
  if(sum(TP<0)>0) stop('TP < 0')
  if(sum(FP<0)>0) stop('FP < 0')
  if(sum(TN<0)>0) stop('TN < 0')
  if(sum(FN<0)>0) stop('FN < 0')
  if(!is.numeric(TP)) stop('Incorrect TP class')
  if(!is.numeric(FP)) stop('Incorrect FP class')
  if(!is.numeric(TN)) stop('Incorrect TN class')
  if(!is.numeric(FN)) stop('Incorrect FN class')
  (TP+TN)/(TP+TN+FP+FN)}
#' Calculate the sensitivity
#'
#' @details
#' Calculate the sensitivity
#'
#' @param TP                Number of true positives
#' @param TN                Number of true negatives
#' @param FN                Number of false negatives
#' @param FP                Number of false positives
#'
#' @return
#' sensitivity value
#'
#' @export
sensitivity <- function(TP,TN,FN,FP){
  if(sum(TP<0)>0) stop('TP < 0')
  if(sum(FP<0)>0) stop('FP < 0')
  if(sum(TN<0)>0) stop('TN < 0')
  if(sum(FN<0)>0) stop('FN < 0')
  if(!is.numeric(TP)) stop('Incorrect TP class')
  if(!is.numeric(FP)) stop('Incorrect FP class')
  if(!is.numeric(TN)) stop('Incorrect TN class')
  if(!is.numeric(FN)) stop('Incorrect FN class')
  TP/(TP+FN)}
#' Calculate the falseNegativeRate
#'
#' @details
#' Calculate the falseNegativeRate
#'
#' @param TP                Number of true positives
#' @param TN                Number of true negatives
#' @param FN                Number of false negatives
#' @param FP                Number of false positives
#'
#' @return
#' falseNegativeRate  value
#'
#' @export
falseNegativeRate <- function(TP,TN,FN,FP){
  if(sum(TP<0)>0) stop('TP < 0')
  if(sum(FP<0)>0) stop('FP < 0')
  if(sum(TN<0)>0) stop('TN < 0')
  if(sum(FN<0)>0) stop('FN < 0')
  if(!is.numeric(TP)) stop('Incorrect TP class')
  if(!is.numeric(FP)) stop('Incorrect FP class')
  if(!is.numeric(TN)) stop('Incorrect TN class')
  if(!is.numeric(FN)) stop('Incorrect FN class')
  FN/(TP+FN)}
#' Calculate the falsePositiveRate
#'
#' @details
#' Calculate the falsePositiveRate
#'
#' @param TP                Number of true positives
#' @param TN                Number of true negatives
#' @param FN                Number of false negatives
#' @param FP                Number of false positives
#'
#' @return
#' falsePositiveRate  value
#'
#' @export
falsePositiveRate <- function(TP,TN,FN,FP){
  if(sum(TP<0)>0) stop('TP < 0')
  if(sum(FP<0)>0) stop('FP < 0')
  if(sum(TN<0)>0) stop('TN < 0')
  if(sum(FN<0)>0) stop('FN < 0')
  if(!is.numeric(TP)) stop('Incorrect TP class')
  if(!is.numeric(FP)) stop('Incorrect FP class')
  if(!is.numeric(TN)) stop('Incorrect TN class')
  if(!is.numeric(FN)) stop('Incorrect FN class')
  FP/(FP+TN)}
#' Calculate the specificity
#'
#' @details
#' Calculate the specificity
#'
#' @param TP                Number of true positives
#' @param TN                Number of true negatives
#' @param FN                Number of false negatives
#' @param FP                Number of false positives
#'
#' @return
#' specificity value
#'
#' @export
specificity <- function(TP,TN,FN,FP){
  if(sum(TP<0)>0) stop('TP < 0')
  if(sum(FP<0)>0) stop('FP < 0')
  if(sum(TN<0)>0) stop('TN < 0')
  if(sum(FN<0)>0) stop('FN < 0')
  if(!is.numeric(TP)) stop('Incorrect TP class')
  if(!is.numeric(FP)) stop('Incorrect FP class')
  if(!is.numeric(TN)) stop('Incorrect TN class')
  if(!is.numeric(FN)) stop('Incorrect FN class')
  TN/(FP+TN)}
#' Calculate the positivePredictiveValue
#'
#' @details
#' Calculate the positivePredictiveValue
#'
#' @param TP                Number of true positives
#' @param TN                Number of true negatives
#' @param FN                Number of false negatives
#' @param FP                Number of false positives
#'
#' @return
#' positivePredictiveValue value
#'
#' @export
positivePredictiveValue <- function(TP,TN,FN,FP){
  if(sum(TP<0)>0) stop('TP < 0')
  if(sum(FP<0)>0) stop('FP < 0')
  if(sum(TN<0)>0) stop('TN < 0')
  if(sum(FN<0)>0) stop('FN < 0')
  if(!is.numeric(TP)) stop('Incorrect TP class')
  if(!is.numeric(FP)) stop('Incorrect FP class')
  if(!is.numeric(TN)) stop('Incorrect TN class')
  if(!is.numeric(FN)) stop('Incorrect FN class')
  TP/(TP+FP)}
#' Calculate the falseDiscoveryRate
#'
#' @details
#' Calculate the falseDiscoveryRate
#'
#' @param TP                Number of true positives
#' @param TN                Number of true negatives
#' @param FN                Number of false negatives
#' @param FP                Number of false positives
#'
#' @return
#' falseDiscoveryRate value
#'
#' @export
falseDiscoveryRate <- function(TP,TN,FN,FP){
  if(sum(TP<0)>0) stop('TP < 0')
  if(sum(FP<0)>0) stop('FP < 0')
  if(sum(TN<0)>0) stop('TN < 0')
  if(sum(FN<0)>0) stop('FN < 0')
  if(!is.numeric(TP)) stop('Incorrect TP class')
  if(!is.numeric(FP)) stop('Incorrect FP class')
  if(!is.numeric(TN)) stop('Incorrect TN class')
  if(!is.numeric(FN)) stop('Incorrect FN class')
  FP/(TP+FP)}
#' Calculate the negativePredictiveValue
#'
#' @details
#' Calculate the negativePredictiveValue
#'
#' @param TP                Number of true positives
#' @param TN                Number of true negatives
#' @param FN                Number of false negatives
#' @param FP                Number of false positives
#'
#' @return
#' negativePredictiveValue value
#'
#' @export
negativePredictiveValue <- function(TP,TN,FN,FP){
  if(sum(TP<0)>0) stop('TP < 0')
  if(sum(FP<0)>0) stop('FP < 0')
  if(sum(TN<0)>0) stop('TN < 0')
  if(sum(FN<0)>0) stop('FN < 0')
  if(!is.numeric(TP)) stop('Incorrect TP class')
  if(!is.numeric(FP)) stop('Incorrect FP class')
  if(!is.numeric(TN)) stop('Incorrect TN class')
  if(!is.numeric(FN)) stop('Incorrect FN class')
  TN/(FN+TN)}
#' Calculate the falseOmissionRate
#'
#' @details
#' Calculate the falseOmissionRate
#'
#' @param TP                Number of true positives
#' @param TN                Number of true negatives
#' @param FN                Number of false negatives
#' @param FP                Number of false positives
#'
#' @return
#' falseOmissionRate value
#'
#' @export
falseOmissionRate <- function(TP,TN,FN,FP){
  if(sum(TP<0)>0) stop('TP < 0')
  if(sum(FP<0)>0) stop('FP < 0')
  if(sum(TN<0)>0) stop('TN < 0')
  if(sum(FN<0)>0) stop('FN < 0')
  if(!is.numeric(TP)) stop('Incorrect TP class')
  if(!is.numeric(FP)) stop('Incorrect FP class')
  if(!is.numeric(TN)) stop('Incorrect TN class')
  if(!is.numeric(FN)) stop('Incorrect FN class')
  FN/(FN+TN)}

#' Calculate the positiveLikelihoodRatio
#'
#' @details
#' Calculate the positiveLikelihoodRatio
#'
#' @param TP                Number of true positives
#' @param TN                Number of true negatives
#' @param FN                Number of false negatives
#' @param FP                Number of false positives
#'
#' @return
#' positiveLikelihoodRatio value
#'
#' @export
positiveLikelihoodRatio <- function(TP,TN,FN,FP){
  if(sum(TP<0)>0) stop('TP < 0')
  if(sum(FP<0)>0) stop('FP < 0')
  if(sum(TN<0)>0) stop('TN < 0')
  if(sum(FN<0)>0) stop('FN < 0')
  if(!is.numeric(TP)) stop('Incorrect TP class')
  if(!is.numeric(FP)) stop('Incorrect FP class')
  if(!is.numeric(TN)) stop('Incorrect TN class')
  if(!is.numeric(FN)) stop('Incorrect FN class')
  (TP/(TP+FN))/(FP/(FP+TN))}

#' Calculate the negativeLikelihoodRatio
#'
#' @details
#' Calculate the negativeLikelihoodRatio
#'
#' @param TP                Number of true positives
#' @param TN                Number of true negatives
#' @param FN                Number of false negatives
#' @param FP                Number of false positives
#'
#' @return
#' negativeLikelihoodRatio value
#'
#' @export
negativeLikelihoodRatio <- function(TP,TN,FN,FP){
  if(sum(TP<0)>0) stop('TP < 0')
  if(sum(FP<0)>0) stop('FP < 0')
  if(sum(TN<0)>0) stop('TN < 0')
  if(sum(FN<0)>0) stop('FN < 0')
  if(!is.numeric(TP)) stop('Incorrect TP class')
  if(!is.numeric(FP)) stop('Incorrect FP class')
  if(!is.numeric(TN)) stop('Incorrect TN class')
  if(!is.numeric(FN)) stop('Incorrect FN class')
  (FN/(TP+FN))/(TN/(FP+TN))}


#' Calculate the diagnostic odds ratio
#'
#' @details
#' Calculate the diagnostic odds ratio
#'
#' @param TP                Number of true positives
#' @param TN                Number of true negatives
#' @param FN                Number of false negatives
#' @param FP                Number of false positives
#'
#' @return
#' diagnosticOddsRatio value
#'
#' @export
diagnosticOddsRatio <- function(TP,TN,FN,FP){
  if(sum(TP<0)>0) stop('TP < 0')
  if(sum(FP<0)>0) stop('FP < 0')
  if(sum(TN<0)>0) stop('TN < 0')
  if(sum(FN<0)>0) stop('FN < 0')
  if(!is.numeric(TP)) stop('Incorrect TP class')
  if(!is.numeric(FP)) stop('Incorrect FP class')
  if(!is.numeric(TN)) stop('Incorrect TN class')
  if(!is.numeric(FN)) stop('Incorrect FN class')
  ((TP/(TP+FN))/(FP/(FP+TN)))/((FN/(TP+FN))/(TN/(FP+TN)))}
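# Example (hedged): the same confusion matrix (TP = 10, TN = 70, FN = 5,
# FP = 15) across several of the measures above:
#   sensitivity(10, 70, 5, 15)              # 10/15 ~ 0.667
#   specificity(10, 70, 5, 15)              # 70/85 ~ 0.824
#   positivePredictiveValue(10, 70, 5, 15)  # 10/25 = 0.4
#   positiveLikelihoodRatio(10, 70, 5, 15)  # 0.667/0.176 ~ 3.78
#   diagnosticOddsRatio(10, 70, 5, 15)      # (10*70)/(15*5) ~ 9.33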