R/predieval.R

Defines functions predieval

Documented in predieval

#' Calculating measures for calibration for benefit for a prediction model
#'
#' This function calculates a series of measures to assess decision accuracy, discrimination for benefit, and calibration for benefit of a prediction model.
#' @param repeats The number of repetitions for the algorithm.
#' @param Ngroups The number of groups to split the data.
#' @param X A dataframe with patient covariates.
#' @param Y The observed outcome. For binary outcomes this should be 0 or 1
#' @param treat A vector with the treatment assignment. This must be 0 (for control treatment)
#' or 1 (for active treatment).
#' @param predicted.treat.0 A vector with the model predictions for each patient, under the control treatment.
#' For the case of a binary outcome this should be probabilities of an event.
#' @param predicted.treat.1 A vector with the model predictions for each patient, under the active treatment.
#' For the case of a binary outcome this should be probabilities of an event.
#' @param type The type of the outcome, "binary" or "continuous".
#' @param bootstraps The number of bootstrap samples to be used for calculating confidence intervals.
#' @param Threshold Threshold for treatment benefit
#' @return A table with all estimated measures of performance.
#' @importFrom Matching Match
#' @importFrom Hmisc rcorr.cens
#' @importFrom stats coef confint binomial complete.cases complete.cases glm lm median predict quantile sd var kmeans
#' @examples
#'  # continuous outcome
#'  dat0=simcont(500)$dat
#'  head(dat0)
#'  # Randomly shuffle the data
#'  dat<-dat0[sample(nrow(dat0)),]
#'  # Create random folds
#'  dat$folds <- cut(seq(1,nrow(dat)),breaks=10,labels=FALSE)
#'
#'  # Obtain out-of-sample predictions
#'  dat.out.CV<-list()
#'  for (i in 1:10){
#'    dat.in.CV=dat[dat$folds!=i,]
#'    dat.out.CV[[i]]=dat[dat$folds==i,]
#'    dat1<-dat.out.CV[[i]]; dat1$t=1
#'    dat0<-dat.out.CV[[i]]; dat0$t=0
#'    m1=lm(data=dat.in.CV, y.observed~x1*t+x2*t)
#'    dat.out.CV[[i]]$predict.treat.1=predict(newdata=dat1, m1)# predictions in treatment
#'    dat.out.CV[[i]]$predict.treat.0=predict(newdata=dat0, m1)# predicions in control
#'  }
#'
#'  dat.CV=dat.out.CV[[1]]
#'  for (i in 2:10){  dat.CV=rbind(dat.CV,dat.out.CV[[i]])}
#'
#'  # assess model performance
#'  predieval(repeats=20, Ngroups=c(5:10),
#'              X=dat.CV[,c("x1", "x2","x3")],
#'              Y=dat.CV$y.observed,
#'              predicted.treat.1 = dat.CV$predict.treat.1,
#'              predicted.treat.0 = dat.CV$predict.treat.0,
#'              treat=dat.CV$t, type="continuous")
#'
#'
#'  # binary outcome
#'  dat0=simbinary(500)$dat
#'  head(dat0)
#'
#'  # Randomly shuffle the data
#'  dat<-dat0[sample(nrow(dat0)),]
#'  # Create random folds
#'  dat$folds <- cut(seq(1,nrow(dat)),breaks=10,labels=FALSE)
#'
#'  dat.out.CV<-list()
#'  for (i in 1:10){
#'    dat.in.CV=dat[dat$folds!=i,]
#'    dat.out.CV[[i]]=dat[dat$folds==i,]
#'    dat1<-dat.out.CV[[i]]; dat1$t=1
#'    dat0<-dat.out.CV[[i]]; dat0$t=0
#'    glm1=glm(y.observed~(x1+x2+x3)*t, data=dat.in.CV, family = binomial(link = "logit"))
#'    dat.out.CV[[i]]$predict.treat.1=predict(newdata=dat1, glm1)# predictions in treatment
#'    dat.out.CV[[i]]$predict.treat.0=predict(newdata=dat0, glm1)# predicions in control
#'  }
#'
#'  dat.CV=dat.out.CV[[1]]
#'  for (i in 2:10){  dat.CV=rbind(dat.CV,dat.out.CV[[i]])}
#'
#'
#'  predieval(repeats=20, Ngroups=c(5:10), X=dat.CV[,c("x1", "x2","x3")],
#'              Y=dat.CV$y.observed,
#'              predicted.treat.1 = expit(dat.CV$predict.treat.1),
#'              predicted.treat.0 = expit(dat.CV$predict.treat.0),
#'              treat=dat.CV$t, type="binary",bootstraps = 50)

#'
#' @export
predieval<-function(repeats=50,
                     Ngroups=10,
                     X,
                     treat,
                     Y,
                     predicted.treat.1,

                     predicted.treat.0,
                     type="continuous",
                     bootstraps=500, Threshold=0
){
  Ngroups.length = length(Ngroups)
  if (type == "continuous") {
    cat(" Type of outcome: continuous", "\n", "Repeats: ",
        repeats, "\n")
    dat1 <- data.frame(cbind(X, Y = c(Y), t = treat, benefit = predicted.treat.1 -
                               predicted.treat.0))

    # PB
    g11 <- dat1[dat1$benefit > Threshold & dat1$t == 1, ]
    n11 <- length(g11$benefit)
    g12 <- dat1[dat1$benefit > Threshold & dat1$t == 0, ]
    n12 <- length(g12$benefit)
    g13 <- dat1[dat1$benefit < Threshold & dat1$t == 1, ]
    n13 <- length(g13$benefit)
    g14 <- dat1[dat1$benefit < Threshold & dat1$t == 0, ]
    n14 <- length(g14$benefit)


    if (n11<5|n12<5|n13<5|n14<5){
      stop(call.=F, paste("The chosen value for benefit threshold, (Threshold=",Threshold,
                          "), is either too large or too small compared to the predicted benefit. Population Benefit cannot be estimated. Please change the benefit threshold.",sep=""))

    }
    dat1$agree1 <- (sign(dat1$benefit-Threshold) == sign(2 * dat1$t -
                                                           1))
    dat.disc <- cbind(Y = dat1$Y, agree1 = dat1$agree1,
                      X)
    s1 <- summary(lm(Y ~ ., data = dat.disc))$coef[2, ]
    discr.1 <- (paste(round(s1[1], digits = 2), " [", round(s1[1] -
                                                              1.96 * s1[2], digits = 2), "; ", round(s1[1] + 1.96 *
                                                                                                       s1[2], digits = 2), "]", sep = ""))
    PB0 <- sum(g11$Y)/(n11 + n14) - sum(g12$Y)/(n12 + n14) +
      sum(g14$Y) * (1/(n11 + n14) - 1/(n12 + n14))
    var.PB0 <- n11 * var(g11$Y)/(n11 + n14)^2 + n12 * var(g12$Y)/(n12 +
                                                                    n14)^2 + n14 * var(g14$Y) * (1/(n11 + n14) - 1/(n12 +
                                                                                                                      n14))^2
    discr.2 <- paste(round(PB0, digits = 2), " [", round(PB0 -
                                                           1.96 * sqrt(var.PB0), digits = 2), "; ", round(PB0 +
                                                                                                            1.96 * sqrt(var.PB0), digits = 2), "]", sep = "")
    PB1 <- sum(g14$Y)/(n11 + n14) - sum(g13$Y)/(n11 + n13) +
      sum(g11$Y) * (1/(n11 + n14) - 1/(n11 + n13))
    var.PB1 <- n14 * var(g14$Y)/(n11 + n14)^2 + n13 * var(g13$Y)/(n11 +
                                                                    n13)^2 + n11 * var(g11$Y) * (1/(n11 + n14) - 1/(n11 +
                                                                                                                      n13))^2
    discr.3 <- paste(round(PB1, digits = 2), " [", round(PB1 -
                                                           1.96 * sqrt(var.PB1), digits = 2), "; ", round(PB1 +
                                                                                                            1.96 * sqrt(var.PB1), digits = 2), "]", sep = "")


    mean.bias <- mean(dat1$Y[dat1$t == 1]) - mean(dat1$Y[dat1$t ==
                                                           0]) - mean(dat1$benefit)
    rmse.reg <- c()
    a0.reg <- c()
    a1.reg <- c()
    r2.reg <- c()
    for (group in 1:Ngroups.length) {
      quantiles1 <- quantile(dat1$benefit, probs <- seq(0,
                                                        1, 1/Ngroups[[group]]))
      g1 <- list()
      predicted.reg <- c()
      observed.reg <- c()
      for (i in 1:Ngroups[[group]]) {
        g1[[i]] <- dat1[dat1$benefit >= quantiles1[i] &
                          dat1$benefit < quantiles1[i + 1], ]
        predicted.reg <- c(predicted.reg, mean(g1[[i]]$benefit))
        observed.reg <- c(observed.reg, mean(g1[[i]]$Y[g1[[i]]$t ==
                                                         1]) - mean(g1[[i]]$Y[g1[[i]]$t == 0]))
      }
      compare <- data.frame(predicted.reg, observed.reg)
      compare <- compare[complete.cases(compare), ]
      rmse.reg <- c(rmse.reg, sqrt(mean((compare$predicted.reg -
                                           compare$observed.reg)^2)))
      lm1r <- summary(lm(compare$observed.reg ~ compare$predicted.reg))
      a0.reg <- c(a0.reg, coef(lm1r)[1, 1])
      a1.reg <- c(a1.reg, coef(lm1r)[2, 1])
      r2.reg <- c(r2.reg, lm1r$r.squared)
    }
    results.regr = data.frame(Method = c(paste("group by benefit N=",
                                               Ngroups[1], sep = "")), RMSE = rmse.reg[1], a0 = a0.reg[1],
                              a1 = a1.reg[1], R2 = r2.reg[1])
    if (Ngroups.length > 1) {
      for (n in 2:Ngroups.length) {
        d1 <- data.frame(Method = c(paste("group by benefit N=",
                                          Ngroups[n], sep = "")), RMSE = rmse.reg[n],
                         a0 = a0.reg[n], a1 = a1.reg[n], R2 = r2.reg[n])
        results.regr = rbind(results.regr, d1)
      }
    }
    rmse.m1 <- c()
    coeff.m11 <- c()
    coeff.m12 <- c()
    R.2.m1 <- c()
    results.kmeans <- list()
    percent1.s2 <- list()
    for (n in 1:Ngroups.length) {
      percent.s2 <- c()
      for (k in 1:repeats) {
        groups.kmean <- kmeans(x = dat1[, colnames(X)],
                               centers = Ngroups[n], iter.max = 50)
        dat1$group <- groups.kmean$cluster
        ngroups <- as.vector(table(dat1$group))
        observed.groups1 <- c()
        observed.groups0 <- c()
        estimated.groups.m1 <- c()
        dall <- NULL
        for (i in 1:Ngroups[n]) {
          observed.groups1 <- c(observed.groups1, (mean(dat1$Y[dat1$group ==
                                                                 i & dat1$t == 1])))
          observed.groups0 <- c(observed.groups0, (mean(dat1$Y[dat1$group ==
                                                                 i & dat1$t == 0])))
          observed.sign <- (observed.groups1 - observed.groups0) >
            0
          estimated.groups.m1 <- c(estimated.groups.m1,
                                   mean(dat1$benefit[dat1$group == i]))
          estimated.sign <- (estimated.groups.m1 > 0)
        }
        dall1 <- (cbind(observed.sign, estimated.sign))
        dall1 <- dall1[complete.cases(dall1), ]
        percent.s2 <- c(percent.s2, sum(dall1[, 1] ==
                                          dall1[, 2])/length(dall1[, 1]))
        dall <- data.frame(observed.groups0, observed.groups1,
                           estimated.groups.m1, ngroups)
        dall <- dall[complete.cases(dall), ]
        dall$observed.groups = dall$observed.groups1 -
          dall$observed.groups0
        reg.m1 <- summary(lm(dall$observed.groups ~
                               dall$estimated.groups.m1, weights = dall$ngroups))
        coeff.m11 <- c(coeff.m11, coef(reg.m1)[1])
        coeff.m12 <- c(coeff.m12, coef(reg.m1)[2])
        R.2.m1 <- c(R.2.m1, reg.m1$r.squared)
        rmse.m1 <- c(rmse.m1, sqrt(mean((dall$observed.groups -
                                           dall$estimated.groups.m1)^2)))
      }
      percent1.s2[[n]] <- round(mean(percent.s2), digits = 2)
      results.kmeans[[n]] <- data.frame(rmse = round(median(rmse.m1),
                                                     digits = 2), a0 = round(median(coeff.m11), digits = 2),
                                        a1 = round(median(coeff.m12), digits = 2), R2 = round(median(R.2.m1),
                                                                                              digits = 2))
    }
    rmse.m1 <- c()
    m1.a0 <- c()
    m1.a1 <- c()
    m1.R2 <- c()
    percentX.s22 <- c()
    for (k in 1:repeats) {
      g1 <- dat1[dat1$t == 1, ]
      g0 <- dat1[dat1$t == 0, ]
      n1 <- min(length(g1$t), length(g0$t))
      g1 <- g1[sample(length(g1$t), n1), ]
      g0 <- g0[sample(length(g0$t), n1), ]
      dataall <- rbind(g1, g0)
      covariates <- dataall[, colnames(X)]
      matched <- Match(Tr = dataall$t, X = covariates)
      data.compare <- data.frame(tr.obs = dataall$Y[matched$index.treated],
                                 ctr.obs = dataall$Y[matched$index.control],
                                 benefit.m1 = (dataall$benefit[matched$index.treated] +
                                                 dataall$benefit[matched$index.control])/2)
      data.compare$obs.benefit <- with(data.compare, tr.obs -
                                         ctr.obs)
      percentX.s22 <- c(percentX.s22, mean(sign(data.compare$obs.benefit-Threshold) ==
                                             sign(data.compare$benefit.m1-Threshold)))
      rmse.m1 <- c(rmse.m1, sqrt(mean((data.compare$obs.benefit -
                                         data.compare$benefit.m1)^2)))
      reg.m1 <- lm(data.compare$obs.benefit ~ data.compare$benefit.m1)
      m1.a0 <- c(m1.a0, coef(reg.m1)[1])
      m1.a1 = c(m1.a1, coef(reg.m1)[2])
      m1.R2 = c(m1.R2, summary(reg.m1)$r.squared)
    }
    results.after.matching <- data.frame(rmse = round(median(rmse.m1),
                                                      digits = 2), a0 = round(median(m1.a0), digits = 2),
                                         a1 = round(median(m1.a1), digits = 2), R2 = round(median(m1.R2),
                                                                                           digits = 2))
    dataall <- g0 <- g1 <- n1 <- covariates <- data.compare <- matched <- reg.m1 <- NULL
    rmse.m1 <- m1.a0 <- m1.a1 <- m1.R2 <- c()
    percentB.s2 = c()
    for (k in 1:repeats) {
      g1 <- dat1[dat1$t == 1, ]
      g0 <- dat1[dat1$t == 0, ]
      n1 <- min(length(g1$t), length(g0$t))
      g1 <- g1[sample(length(g1$t), n1), ]
      g0 <- g0[sample(length(g0$t), n1), ]
      dataall <- rbind(g1, g0)
      matched1 <- Match(Tr = dataall$t, X = dataall$benefit)
      data.compare <- data.frame(tr.obs = dataall$Y[matched1$index.treated],
                                 ctr.obs = dataall$Y[matched1$index.control],
                                 benefit.m1 = (dataall$benefit[matched1$index.treated] +
                                                 dataall$benefit[matched1$index.control])/2)
      data.compare$obs.benefit <- with(data.compare, tr.obs -
                                         ctr.obs)
      percentB.s2 <- c(percentB.s2, mean(sign(data.compare$obs.benefit-Threshold) ==
                                           sign(data.compare$benefit.m1-Threshold)))
      rmse.m1 <- c(rmse.m1, sqrt(mean((data.compare$obs.benefit -
                                         data.compare$benefit.m1)^2)))
      reg.m1 <- lm(data.compare$obs.benefit ~ data.compare$benefit.m1)
      m1.a0 <- c(m1.a0, coef(reg.m1)[1])
      m1.a1 = c(m1.a1, coef(reg.m1)[2])
      m1.R2 = c(m1.R2, summary(reg.m1)$r.squared)
    }
    results.d <- data.frame(`Estimand, estimation method` = c("Population-level benefit (PB), adjusted",
                                                              "Population-level benefit vs t=0 (PB0)", "Population-level benefit vs t=1 (PB1)",
                                                              paste("Benefit accuracy (BA), k-means N=", Ngroups[1],
                                                                    sep = "")), Estimate = c(discr.1, discr.2, discr.3,
                                                                                             round(percent1.s2[[1]], digits = 2)), check.names = F)
    results.after.matching.benefit = data.frame(rmse = round(median(rmse.m1),
                                                             digits = 2), a0 = round(median(m1.a0), digits = 2),
                                                a1 = round(median(m1.a1), digits = 2), R2 = round(median(m1.R2),
                                                                                                  digits = 2))
    results <- data.frame(Method = c(paste("kmeans N=",
                                           Ngroups[1], sep = "")), RMSE = c(results.kmeans[[1]]$rmse),
                          a0 = c(results.kmeans[[1]]$a0), a1 = c(results.kmeans[[1]]$a1),
                          R2 = c(results.kmeans[[1]]$R2))
    if (Ngroups.length > 1) {
      for (n in 2:Ngroups.length) {
        d1 <- data.frame(Method = c(paste("kmeans N=",
                                          Ngroups[n], sep = "")), RMSE = c(results.kmeans[[n]]$rmse),
                         a0 = c(results.kmeans[[n]]$a0), a1 = c(results.kmeans[[n]]$a1),
                         R2 = c(results.kmeans[[n]]$R2))
        results = rbind(results, d1)
        d2 <- data.frame(`Estimand, estimation method` = c(paste("Benefit accuracy (BA), k-means N=",
                                                                 Ngroups[n], sep = "")), Estimate = as.character(c(round(percent1.s2[[n]],
                                                                                                                         digits = 2))), check.names = F)
        results.d = rbind(results.d, d2)
      }
    }
    results.d <- rbind(results.d, data.frame(`Estimand, estimation method` = "Benefit accuracy (BA), match by covariates",
                                             Estimate = as.character(round(mean(percentX.s22),
                                                                           digits = 2)), check.names = F))
    results.d <- rbind(results.d, data.frame(`Estimand, estimation method` = "Benefit accuracy (BA), match by benefit",
                                             Estimate = as.character(round(mean(percentB.s2),
                                                                           digits = 2)), check.names = F))
    results <- rbind(results.regr, results)
    results <- rbind(results, data.frame(Method = c("match by covariates",
                                                    "match by benefit"), RMSE = c(results.after.matching$rmse,
                                                                                  results.after.matching.benefit$rmse), a0 = c(results.after.matching$a0,
                                                                                                                               results.after.matching.benefit$a0), a1 = c(results.after.matching$a1,
                                                                                                                                                                          results.after.matching.benefit$a1), R2 = c(results.after.matching$R2,
                                                                                                                                                                                                                     results.after.matching.benefit$R2)))
    dat1$predicted.treat.0 <- predicted.treat.0
    lm1 <- lm(dat1$Y ~ dat1$predicted.treat.0 + dat1$t:dat1$benefit)
    sum.lm1 <- summary(lm1)
    ci1 <- confint(lm1)
    c2 <- paste(round(summary(lm1)$coef[3], digits = 2),
                " [", round(ci1[3, 1], digits = 2), "; ", round(ci1[3,
                                                                    2], digits = 2), "]", sep = "")
    results.reg1 <- data.frame(`Calibration slope` = c2)
    rownames(results.reg1) <- c("Estimate")
    results[2:5] <- round(results[2:5], digits = 3)
    mean.bias <- round(mean.bias, 2)
    results.all <- list(outcome.type = type, decision.accuracy = results.d,
                        calibration.for.benefit = results, regression.for.benefit = results.reg1,
                        mean.bias = mean.bias)
  }
  if (type == "binary") {

    if (min(c(predicted.treat.1,predicted.treat.0))<0 | max(c(predicted.treat.1,predicted.treat.0))>1){
      stop(call.=F, paste("The provided probabilities lie outside the [0,1] range."))
    }


    cat(" Type of outcome: binary", "\n", "Repeats: ", repeats,
        "\n", "Number of bootstraps for caclulating confidence intervals: ",
        bootstraps, "\n")
    dat1 <- cbind(X, Y = Y, t = treat, predicted.treat.1 = predicted.treat.1,
                  predicted.treat.0 = predicted.treat.0, benefit = predicted.treat.1 -
                    predicted.treat.0)
    g11 = dat1[dat1$benefit > Threshold & dat1$t == 1, ]
    n11 = length(g11$benefit)
    g12 = dat1[dat1$benefit > Threshold & dat1$t == 0, ]
    n12 = length(g12$benefit)
    g13 = dat1[dat1$benefit < Threshold & dat1$t == 1, ]
    n13 = length(g13$benefit)
    g14 = dat1[dat1$benefit < Threshold & dat1$t == 0, ]
    n14 = length(g14$benefit)

    if (n11<5|n12<5|n13<5|n14<5){
      stop(call.=F, paste("The chosen value for benefit threshold, (Threshold=",Threshold,
                          "), is either too large or too small compared to the predicted benefit. Population Benefit cannot be estimated. Please change the benefit threshold.",sep=""))

    }


    dat1$agree = 1 * (sign(dat1$benefit-Threshold) == sign(2 * dat1$t -
                                                             1))
    dat.disc = cbind(Y = dat1$Y, agree = dat1$agree, X)
    magree = glm(Y ~ ., data = dat.disc, family = "binomial")
    d.test0 = dat.disc
    d.test0$agree = 0
    p0 = predict(magree, newdata = d.test0, type = "response")
    d.test1 = dat.disc
    d.test1$agree = 1
    p1 = predict(magree, newdata = d.test1, type = "response")
    S1mean = mean(p1) - mean(p0)
    bootdif <- function(dd) {
      boot = sample(nrow(dd), replace = T)
      db = dd[boot, ]
      glmfit <- glm(Y ~ ., data = db, family = "binomial")
      newdata <- db
      newdata$agree = 0
      p0 <- mean(predict(glmfit, newdata, type = "response"))
      newdata <- db
      newdata$agree = 1
      p1 <- mean(predict(glmfit, newdata, type = "response"))
      return(p1 - p0)
    }
    bootest <- unlist(lapply(1:bootstraps, function(x) bootdif(dat.disc)))
    da.1 = paste(round(S1mean, digits = 3), " [", round(quantile(bootest,
                                                                 c(0.025, 0.975))[1], digits = 3), "; ", round(quantile(bootest,
                                                                                                                        c(0.025, 0.975))[2], digits = 3), "]", sep = "")
    PB0 = sum(g11$Y)/(n11 + n14) - sum(g12$Y)/(n12 + n14) +
      sum(g14$Y) * (1/(n11 + n14) - 1/(n12 + n14))
    var.PB0 = n11 * var(g11$Y)/(n11 + n14)^2 + n12 * var(g12$Y)/(n12 +
                                                                   n14)^2 + n14 * var(g14$Y) * (1/(n11 + n14) - 1/(n12 +
                                                                                                                     n14))^2
    da.2 = paste(round(PB0, digits = 3), " [", round(PB0 -
                                                       1.96 * sqrt(var.PB0), digits = 3), "; ", round(PB0 +
                                                                                                        1.96 * sqrt(var.PB0), digits = 3), "]", sep = "")
    PB1 = sum(g14$Y)/(n11 + n14) - sum(g13$Y)/(n11 + n13) +
      sum(g11$Y) * (1/(n11 + n14) - 1/(n11 + n13))
    var.PB1 = n14 * var(g14$Y)/(n11 + n14)^2 + n13 * var(g13$Y)/(n11 +
                                                                   n13)^2 + n11 * var(g11$Y) * (1/(n11 + n14) - 1/(n11 +
                                                                                                                     n13))^2
    da.3 = paste(round(PB1, digits = 3), " [", round(PB1 -
                                                       1.96 * sqrt(var.PB1), digits = 3), "; ", round(PB1 +
                                                                                                        1.96 * sqrt(var.PB1), digits = 3), "]", sep = "")
    percent.1on1.ben = c()
    c.by.benefit = c()
    se.c.by.benefit = c()
    c.by.covariates = c()
    se.c.by.covariates = c()
    n.0 <- sum(dat1$t == 0)
    n.1 <- sum(dat1$t == 1)
    n.10 = min(n.0, n.1)
    for (i in 1:repeats) {
      rows.in <- c(which(dat1$t == 1)[sample(n.1, n.10)],
                   which(dat1$t == 0)[sample(n.0, n.10)])
      dat2 = dat1[rows.in[order(rows.in)], ]
      X2 = X[rows.in[order(rows.in)], ]
      ind.0 <- which(dat2$t == 0)
      order.0 <- order(dat2$benefit[ind.0])
      ind.0 <- ind.0[order.0]
      ind.1 <- which(dat2$t == 1)
      order.1 <- order(dat2$benefit[ind.1])
      ind.1 <- ind.1[order.1]
      pred.ben.0 <- dat2$benefit[ind.0]
      pred.ben.1 <- dat2$benefit[ind.1]
      pred.ben.avg <- (pred.ben.1 + pred.ben.0)/2
      obs.out.0 <- dat2$Y[ind.0]
      obs.out.1 <- dat2$Y[ind.1]
      obs.ben <- obs.out.1 - obs.out.0
      pred.ben.avg2 = pred.ben.avg[obs.ben != 0]
      obs.ben2 = obs.ben[obs.ben != 0]
      percent.1on1.ben = c(percent.1on1.ben, mean(sign(obs.ben2-Threshold) ==
                                                    sign(pred.ben.avg2-Threshold)))
      cindex <- rcorr.cens(pred.ben.avg, obs.ben)
      c.by.benefit <- c(c.by.benefit, cindex["C Index"][[1]])
      se.c.by.benefit <- c(se.c.by.benefit, cindex["S.D."][[1]]/2)
      percent.1on1.X = c()
      rr1 <- Match(Tr = dat2$t, X = X2, M = 1, ties = F,
                   replace = FALSE)
      ind.0 <- rr1$index.control
      ind.1 <- rr1$index.treated
      pred.ben.0 <- dat2$benefit[ind.0]
      pred.ben.1 <- dat2$benefit[ind.1]
      pred.ben.avg <- (pred.ben.1 + pred.ben.0)/2
      obs.out.0 <- dat2$Y[ind.0]
      obs.out.1 <- dat2$Y[ind.1]
      obs.ben <- obs.out.1 - obs.out.0
      pred.ben.avg2 = pred.ben.avg[obs.ben != 0]
      obs.ben2 = obs.ben[obs.ben != 0]
      percent.1on1.X = c(percent.1on1.X, mean(sign(obs.ben2-Threshold) ==
                                                sign(pred.ben.avg2-Threshold)))
      cindex2 <- rcorr.cens(pred.ben.avg, obs.ben)
      c.by.covariates <- c(c.by.covariates, cindex2["C Index"][[1]])
      se.c.by.covariates <- c(se.c.by.covariates, cindex2["S.D."][[1]]/2)
    }
    mean.percent.1on1.ben = mean(percent.1on1.ben)
    mean.percent.1on1.X = mean(percent.1on1.X)
    results.c.f.b = c(paste(round(mean(c.by.covariates),
                                  digits = 2), "[", round(mean(c.by.covariates) -
                                                            1.96 * mean(se.c.by.covariates), digits = 2), "; ",
                            round(mean(c.by.covariates) + 1.96 * mean(se.c.by.covariates),
                                  digits = 2), "]", sep = ""), paste(round(mean(c.by.benefit),
                                                                           digits = 2), "[", round(mean(c.by.benefit) - 1.96 *
                                                                                                     mean(se.c.by.benefit), digits = 2), "; ", round(mean(c.by.benefit) +
                                                                                                                                                       1.96 * mean(se.c.by.benefit), digits = 2), "]",
                                                                     sep = ""))
    ben.observed = mean(dat1$Y[dat1$t == 1]) - mean(dat1$Y[dat1$t ==
                                                             0])
    ben.model = mean((dat1$predicted.treat.1)) - mean((dat1$predicted.treat.0))
    mean.bias = ben.observed - ben.model
    rmse.reg <- c()
    a0.reg <- c()
    a1.reg <- c()
    r2.reg <- c()
    for (group in 1:Ngroups.length) {
      quantiles1 <- quantile(dat1$benefit, probs <- seq(0,
                                                        1, 1/Ngroups[[group]]))
      g1 <- list()
      predicted.reg <- c()
      observed.reg <- c()
      for (i in 1:Ngroups[[group]]) {
        g1[[i]] <- dat1[dat1$benefit >= quantiles1[i] &
                          dat1$benefit < quantiles1[i + 1], ]
        predicted.reg <- c(predicted.reg, mean(g1[[i]]$benefit))
        observed.reg <- c(observed.reg, mean(g1[[i]]$Y[g1[[i]]$t ==
                                                         1]) - mean(g1[[i]]$Y[g1[[i]]$t == 0]))
      }
      compare = data.frame(predicted.reg, observed.reg)
      compare = compare[complete.cases(compare), ]
      rmse.reg <- c(rmse.reg, sqrt(mean((compare$predicted.reg -
                                           compare$observed.reg)^2)))
      lm1r <- summary(lm(compare$observed.reg ~ compare$predicted.reg))
      a0.reg <- c(a0.reg, coef(lm1r)[1, 1])
      a1.reg <- c(a1.reg, coef(lm1r)[2, 1])
      r2.reg <- c(r2.reg, lm1r$r.squared)
    }
    results.regr = data.frame(Method = c(paste("group by benefit N=",
                                               Ngroups[1], sep = "")), RMSE = rmse.reg[1], a0 = a0.reg[1],
                              a1 = a1.reg[1], R2 = r2.reg[1])
    if (Ngroups.length > 1) {
      for (n in 2:Ngroups.length) {
        d1 <- data.frame(Method = c(paste("group by benefit N=",
                                          Ngroups[n], sep = "")), RMSE = rmse.reg[n],
                         a0 = a0.reg[n], a1 = a1.reg[n], R2 = r2.reg[n])
        results.regr = rbind(results.regr, d1)
      }
    }
    rmse.m <- coeff.1 <- coeff.2 <- R.2.m <- c()
    results.kmeans.bin <- list()
    percent1.s2 <- list()
    for (n in 1:Ngroups.length) {
      percent.s2 <- c()
      for (k in 1:repeats) {
        groups.kmean <- kmeans(x = dat1[, colnames(X)],
                               centers = Ngroups[n], iter.max = 50)
        dat1$group <- groups.kmean$cluster
        ngroups <- as.vector(table(dat1$group))
        observed.groups1 <- c()
        observed.groups0 <- c()
        estimated.groups <- c()
        dall = NULL
        dall1 = NULL
        for (i in 1:Ngroups[n]) {
          observed.groups1 = c(observed.groups1, (mean(dat1$Y[dat1$group ==
                                                                i & dat1$t == 1])))
          observed.groups0 = c(observed.groups0, (mean(dat1$Y[dat1$group ==
                                                                i & dat1$t == 0])))
          observed.sign = (observed.groups1 - observed.groups0) >
            Threshold
          estimated.groups = c(estimated.groups, mean(dat1$benefit[dat1$group ==
                                                                     i]))
          estimated.sign = (estimated.groups > Threshold)
        }
        dall1 = (cbind(observed.sign, estimated.sign))
        dall1 = dall1[complete.cases(dall1), ]
        percent.s2 = c(percent.s2, sum(dall1[, 1] ==
                                         dall1[, 2])/length(dall1[, 1]))
        dall = data.frame(observed.groups0, observed.groups1,
                          estimated.groups)
        is.na(dall$observed.groups0[is.infinite(dall$observed.groups0)]) = TRUE
        is.na(dall$observed.groups1[is.infinite(dall$observed.groups1)]) = TRUE
        dall = dall[complete.cases(dall), ]
        dall$observed.groups = dall$observed.groups1 -
          dall$observed.groups0
        reg = summary(lm(dall$observed.groups ~ dall$estimated.groups))
        coeff.1 = c(coeff.1, coef(reg)[1])
        coeff.2 = c(coeff.2, coef(reg)[2])
        R.2.m = c(R.2.m, reg$r.squared)
        rmse.m = c(rmse.m, sqrt(mean((dall$observed.groups -
                                        dall$estimated.groups)^2)))
      }
      percent1.s2[[n]] = round(mean(percent.s2), digits = 2)
      results.kmeans.bin[[n]] <- data.frame(rmse = round(median(rmse.m),
                                                         digits = 3), a0 = round(median(coeff.1), digits = 2),
                                            a1 = round(median(coeff.2), digits = 2), R2 = round(median(R.2.m),
                                                                                                digits = 2))
    }
    results = data.frame(Method = c(paste("kmeans N=", Ngroups[1],
                                          sep = "")), RMSE = c(results.kmeans.bin[[1]]$rmse),
                         a0 = c(results.kmeans.bin[[1]]$a0), a1 = c(results.kmeans.bin[[1]]$a1),
                         R2 = c(results.kmeans.bin[[1]]$R2))
    if (Ngroups.length > 1) {
      for (n in 2:Ngroups.length) {
        d1 <- data.frame(Method = c(paste("kmeans N=",
                                          Ngroups[n], sep = "")), RMSE = c(results.kmeans.bin[[n]]$rmse),
                         a0 = c(results.kmeans.bin[[n]]$a0), a1 = c(results.kmeans.bin[[n]]$a1),
                         R2 = c(results.kmeans.bin[[n]]$R2))
        results = rbind(results, d1)
      }
    }
    results = rbind(results.regr, results)
    dat1$log.pred.t1 <- logit(predicted.treat.1)
    dat1$log.pred.t0 <- logit(predicted.treat.0)
    dat1$benefit.lm <- dat1$log.pred.t1 - dat1$log.pred.t0
    glm1 = glm(dat1$Y ~ dat1$log.pred.t0 + dat1$t:dat1$benefit.lm,
               family = binomial)
    ci1 = confint(glm1)
    slope.ben = paste(round(summary(glm1)$coef[3], digits = 2),
                      " [", round(ci1[3, 1], digits = 2), "; ", round(ci1[3,
                                                                          2], digits = 2), "]", sep = "")
    results.reg = data.frame(slope.ben)
    rownames(results.reg) = c("Estimate")
    results.DA = data.frame(`Estimand, estimation method` = c("Population-level benefit (PB), adjusted",
                                                              "Population-level benefit vs t=0 (PB0)", "Population-level benefit vs t=1 (PB1)",
                                                              paste("Benefit accuracy (BA), k-means N=", Ngroups[1],
                                                                    sep = "")), Estimate = c(da.1, da.2, da.3, round(percent1.s2[[1]],
                                                                                                                     digits = 2)), check.names = F)
    if (Ngroups.length > 1) {
      for (n in 2:Ngroups.length) {
        d1 <- data.frame(`Estimand, estimation method` = paste("Benefit accuracy (BA), k-means N=",
                                                               Ngroups[n], sep = ""), Estimate = as.character(round(percent1.s2[[n]],
                                                                                                                    digits = 2)), check.names = F)
        results.DA = rbind(results.DA, d1)
      }
    }
    results.DA = rbind(results.DA, data.frame(`Estimand, estimation method` = c("Benefit accuracy (BA), match by covariates",
                                                                                "Benefit accuracy (BA), match by benefit"), Estimate = c(as.character(round(mean.percent.1on1.X,
                                                                                                                                                            digits = 2)), as.character(round(mean.percent.1on1.ben,
                                                                                                                                                                                             digits = 2))), check.names = F))
    results.discrimination = data.frame(method = c("c-for-benefit, match by covariates",
                                                   "c-for-benefit, match by benefit"), Estimates = c(results.c.f.b[1],
                                                                                                     results.c.f.b[2]))
    results[2:5] = round(results[2:5], digits = 3)
    results.bias = round(mean.bias, digits = 4)
    results.all = list(outcome.type = type, decision.accuracy = results.DA,
                       calibration.via.kmeans = results, mean.bias = results.bias,
                       regression.for.benefit = results.reg, discrimination.for.benefit = results.discrimination)
  }
  return(results.all)


}
esm-ispm-unibe-ch/predieval documentation built on Nov. 15, 2022, 8:12 a.m.