R/reinforced.R

# reinforced risk prediction with budget constraint -----------------------------------------------


#' Model fit for the training set
#'
#' \code{modelFit} outputs the FPCA (functional principal component analysis) decomposition and the parameter estimates
#' of the functional generalized linear model at each time grid.
#'
#' @export
#' @param Y The outcome variable, vector of length \eqn{n}, taking values in \eqn{\{1, 0, NA\}}, where 1 = disease, 0 = not, NA = missing.
#' @param X Observed longitudinal biomarker, matrix of \eqn{n} by \eqn{nTotal}, where \eqn{nTotal} denotes the total number of time grids.
#' Missing values are denoted by NA.
#' @param Z Other baseline covariates.
#' @param startT Time of the first prediction, denoted by \eqn{t_1} in the manuscript. For instance, if the time grids are \eqn{\{0, 1/60, 2/60, \ldots, 1\}},
#' then startT = 25 means that the first prediction is made at \eqn{t = 24/60}.
#' @param link The link function used in functional generalized linear models, e.g. "logit", "probit".
#' @param pve Proportion of variance explained in FPCA.
#' @param nbasis Number of B-spline basis functions used to estimate the mean function and to smooth the covariance surface.
#' @param weight Weight for each individual.
#'
#' @return
#' \item{list_fpcaFit}{FPCA decomposition at each time grid from startT to the end.}
#' \item{list_paraEst}{Parameter estimates at each time grid from startT to the end.}
#'
#' @examples
#' library(reinforcedPred)
#'
#' # take the example training data (univariate Z) from the reinforcedPred package
#' # see documentation for details about the data set train_data_uniZ
#' Y <- as.numeric(train_data_uniZ$Y)
#' tildeX.missing <- as.matrix(train_data_uniZ[,2:62])
#' Z <- as.numeric(train_data_uniZ$Z)
#'
#' # analysis starts
#' startT <- 55
#' link <- "probit"
#' weight <- rep(1, length(Y))
#'
#' result <- modelFit(Y, tildeX.missing, Z, startT, link, pve = 0.99, nbasis = 10, weight)
#'
#' # obtained parameter estimates and FPCA decompositions
#' list_paraEst <- result$list_paraEst
#' list_fpcaFit <- result$list_fpcaFit
#'


modelFit <- function(Y, X, Z, startT, link, pve, nbasis, weight) {

  n <- length(Y)                                 # sample size
  nTotal <- dim(X)[2]                            # total number of time grids

  list_fpcaFit <- list()                         # FPCA fit at each time grid
  list_paraEst <- list()                         # parameter estimates at each time grid

  # curT denotes the current time grid #
  for (curT in startT:nTotal) {

    curX <- X[,1:curT]                           # the up-to-date longitudinal data
    fpcaFit <- refund::fpca.sc(Y = curX, nbasis = nbasis, var = TRUE, pve = pve)
    list_fpcaFit <- append(list_fpcaFit, list(fpcaFit))

    # number of FPCs (functional principal components) #
    K <- fpcaFit$npc

    # FPC scores #
    FPC.score <- fpcaFit$scores

    # fit a logistic or probit regression using FPC scores as predictors #
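    # model: g(P(Y = 1 | X, Z)) = a0 + a1' xi + a2' Z, where xi denotes the FPC
    # scores at the current time grid and g is the chosen link function #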
    glmFit <- stats::glm(Y ~ FPC.score + Z, family = stats::binomial(link = link), weights = weight)
    aHat <- as.numeric(stats::coefficients(glmFit))

    Hat.a0 <- aHat[1]
    Hat.a1 <- aHat[2:(2+K-1)]
    Hat.a2 <- aHat[(2+K):length(aHat)]

    list_paraEst <- append(list_paraEst, list(list("a0Hat" = Hat.a0, "a1Hat" = Hat.a1, "a2Hat" = Hat.a2)))
  }

  return (list("list_fpcaFit" = list_fpcaFit, "list_paraEst" = list_paraEst))
}



#' Risk prediction on the test set
#'
#' For a fixed threshold value \eqn{\tau}, \code{modelPredict} predicts the outcome \eqn{Y} for subjects in the test set.
#' This function also outputs the cost associated with the prediction procedure.
#'
#' @export
#' @param list_fpcaFit Obtained FPCA decomposition from \code{modelFit}.
#' @param list_paraEst Obtained parameter estimates from \code{modelFit}.
#' @param Xtest Longitudinal biomarker data for subjects in the test set, matrix of \eqn{testn} by \eqn{nTotal}. Missing values
#' are denoted by NA.
#' @param Ztest Other baseline covariates for subjects in the test set.
#' @param startT Time of the first prediction, denoted by \eqn{t_1} in the manuscript. For instance, if the time grids are \eqn{\{0, 1/60, 2/60, \ldots, 1\}},
#' then startT = 25 means that the first prediction is made at \eqn{t = 24/60}.
#' @param tau The threshold value \eqn{\tau}.
#'
#' @return
#' \item{final.label}{Predicted outcome \eqn{Y} for subjects in the test set, vector of length \eqn{testn}.}
#' \item{avg.cost}{Average cost when this prediction procedure is applied to the test set.}
#' \item{cost}{Cost for each subject, vector of length \eqn{testn}. For some subjects, we make a definite decision early; for others, we follow up for a longer period of time. Hence the cost differs across individuals.}
#'
#' @examples
#' # see the example from function reinforced.
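#'
#' # a minimal usage sketch (not run): assumes `fit` comes from modelFit on
#' # training data, Xtest and Ztest are the test-set counterparts of X and Z,
#' # and tau = 0.5 is an illustrative threshold (in practice, use the tau
#' # selected by reinforced)
#' \dontrun{
#' fit <- modelFit(Y, X, Z, startT = 25, link = "probit", pve = 0.99,
#'                 nbasis = 10, weight = rep(1, length(Y)))
#' pred <- modelPredict(fit$list_fpcaFit, fit$list_paraEst, Xtest, Ztest,
#'                      startT = 25, tau = 0.5)
#' pred$final.label   # predicted outcome for each test subject
#' pred$avg.cost      # average follow-up cost of the procedure
#' }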

modelPredict <- function(list_fpcaFit, list_paraEst, Xtest, Ztest, startT, tau) {

  testn <- dim(Xtest)[1]                         # number of subjects in the test data set
  nTotal <- dim(Xtest)[2]                        # total number of time grids

  prediction <- matrix(NA, nrow = testn, ncol = nTotal)
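  # prediction[i, t] records the decision for subject i at time grid t:
  # 1 = disease, 0 = not, -100 = deferred (reject); columns before startT stay NA #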

  # curT denotes the current time grid #
  # from the starting time grid to time grid (nTotal - 1), we use classification with a reject option #
  for (curT in startT:(nTotal - 1)) {

    # the fpcaFit at the current time grid #
    fpcaFit <- list_fpcaFit[[curT - startT + 1]]

    # number of FPCs #
    K <- fpcaFit$npc

    # FPC basis functions #
    FPC.basis <- fpcaFit$efunctions

    # estimated mean function #
    mu <- fpcaFit$mu

    # estimated eigenvalues of the covariance operator, i.e., variances of FPC scores #
    evalues <- fpcaFit$evalues

    # estimated measurement error variance #
    sigma2 <- fpcaFit$sigma2

    curXtest <- Xtest[,1:curT]                   # the up-to-date longitudinal measurement

    # estimated FPC scores for subjects in the test data set #
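    # each score vector is obtained by regularized least squares on the observed
    # grid points: (Phi' Phi + sigma2 * D)^+ Phi' (x - mu), where Phi holds the
    # eigenfunctions at the observed points, D = diag(evalues), and ^+ denotes the
    # Moore-Penrose pseudoinverse; the penalty stabilizes the fit when a subject
    # has few observed measurements #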
    estimated.FPC.score <- matrix(NA, nrow = testn, ncol = K)
    for (i in 1:testn) {
      ind <- which(!is.na(curXtest[i, ]))
      temp.matrix <- matrix(FPC.basis[ind, ], ncol = K)

      if (K == 1) {
        estimated.FPC.score[i, ] <- MASS::ginv(t(temp.matrix) %*% temp.matrix + sigma2 * evalues) %*% t(temp.matrix) %*% (Xtest[i, ind] - mu[ind])
      } else {
        estimated.FPC.score[i, ] <- MASS::ginv(t(temp.matrix) %*% temp.matrix + sigma2 * diag(evalues)) %*% t(temp.matrix) %*% (Xtest[i, ind] - mu[ind])
      }

    }

    # the parameter estimates at the current time grid #
    paraFit <- list_paraEst[[curT - startT + 1]]

    Hat.a0 <- paraFit$a0Hat
    Hat.a1 <- paraFit$a1Hat
    Hat.a2 <- paraFit$a2Hat

    # predict the label Y using the linear predictor #
    if (is.vector(Ztest)) {
      f <- Hat.a0 + estimated.FPC.score %*% Hat.a1 + Hat.a2 * Ztest
    } else {
      f <- Hat.a0 + estimated.FPC.score %*% Hat.a1 + Ztest %*% Hat.a2
    }

    # 1 = disease, 0 = not, -100 = uncertain (reject option) #
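    # classify only when the linear predictor f falls outside [-tau, tau];
    # otherwise defer the decision to a later time grid #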
    predict.Y <- rep(-100, testn)
    ind1 <- which(f > tau)
    ind0 <- which(f < -tau)
    predict.Y[ind1] <- 1
    predict.Y[ind0] <- 0

    prediction[, curT] <- predict.Y
  }

  # at the final time grid, we use plain binary classification (no reject option) #
  fpcaFit <- list_fpcaFit[[nTotal - startT + 1]]

  # number of FPCs #
  K <- fpcaFit$npc

  # FPC basis functions #
  FPC.basis <- fpcaFit$efunctions

  # estimated mean function #
  mu <- fpcaFit$mu

  # estimated eigenvalues of the covariance operator, i.e., variances of FPC scores #
  evalues <- fpcaFit$evalues

  # estimated measurement error variance #
  sigma2 <- fpcaFit$sigma2

  # estimated FPC scores for subjects in the test data #
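  # same regularized least squares score estimate as above, now using all
  # observed measurements through the final time grid #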
  estimated.FPC.score <- matrix(NA, nrow = testn, ncol = K)
  for (i in 1:testn) {
    ind <- which(!is.na(Xtest[i, ]))
    temp.matrix <- matrix(FPC.basis[ind, ], ncol = K)

    if (K == 1) {
      estimated.FPC.score[i, ] <- MASS::ginv(t(temp.matrix) %*% temp.matrix + sigma2 * evalues) %*% t(temp.matrix) %*% (Xtest[i, ind] - mu[ind])
    } else {
      estimated.FPC.score[i, ] <- MASS::ginv(t(temp.matrix) %*% temp.matrix + sigma2 * diag(evalues)) %*% t(temp.matrix) %*% (Xtest[i, ind] - mu[ind])
    }
  }

  # the parameter estimates at the last time grid #
  paraFit <- list_paraEst[[nTotal - startT + 1]]

  Hat.a0 <- paraFit$a0Hat
  Hat.a1 <- paraFit$a1Hat
  Hat.a2 <- paraFit$a2Hat

  if (is.vector(Ztest)) {
    f <- Hat.a0 + estimated.FPC.score %*% Hat.a1 + Hat.a2 * Ztest
  } else {
    f <- Hat.a0 + estimated.FPC.score %*% Hat.a1 + Ztest %*% Hat.a2
  }

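  # a definite decision is forced at the last grid; since pnorm(f) > 0.5 exactly
  # when f > 0, this final rule does not depend on the choice of link #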
  prediction[, nTotal] <- ifelse(stats::pnorm(f) > 0.5, 1, 0)

  cost <- rep(NA, testn)
  final.label <- rep(NA, testn)

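  # cost[i] is the first time grid at which subject i receives a definite 0/1
  # label; the final grid always produces one, so the minimum is well defined #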
  for(i in 1:testn) {
    cost[i] <- suppressWarnings(min(min(which(prediction[i, ] == 1)), min(which(prediction[i, ] == 0))))
    final.label[i] <- prediction[i, cost[i]]
  }

  avg.cost <- mean(cost)

  return (list("final.label" = final.label, "avg.cost" = avg.cost, "cost" = cost))
}


#' Reinforced risk prediction with budget constraint
#'
#' \code{reinforced} implements a cross-validation approach to find an optimal \eqn{\tau} such that the
#' misclassification error is minimized under a certain budget constraint.
#' @export
#' @param Y The outcome variable, vector of length \eqn{n}, taking values in \eqn{\{1, 0, NA\}}, where 1 = disease, 0 = not, NA = missing.
#' @param X Observed longitudinal biomarker, matrix of \eqn{n} by \eqn{nTotal}, where \eqn{nTotal} denotes the total number of time grids.
#' Missing values are denoted by NA.
#' @param Z Other baseline covariates.
#' @param budget The budget constraint. For instance, if the time grids are \eqn{\{0, 1/60, 2/60, \ldots, 1\}}, budget = 30 means that the average follow-up is no longer than 30 time grids.
#' This is equivalent to saying that, on average, we want to make a definite prediction before time \eqn{t = 0.5}.
#' @param folds Folds in cross-validation, usually 5 or 10.
#' @param startT Time of the first prediction, denoted by \eqn{t_1} in the manuscript. For instance, if the time grids are \eqn{\{0, 1/60, 2/60, \ldots, 1\}},
#' then startT = 25 means that the first prediction is made at \eqn{t = 24/60}.
#' @param link The link function used in functional generalized linear models, e.g. "logit", "probit".
#' @param pve Proportion of variance explained in FPCA; the default value is 0.99.
#' @param nbasis Number of B-spline basis functions needed for estimation of the mean function and smoothing of covariance.
#' The default value is 10, as in the refund package; a smaller number is sometimes needed when there are few time grids.
#' @param weight A user-supplied weight for each individual. If no weight is supplied, an inverse probability
#' weighting method is used to compute the weights. See Section 3.4 of the manuscript for details.
#'
#' @return
#' \item{final.result}{The FPCA fit and the parameter estimates at each time grid from startT to the end.}
#' \item{final.tau}{The optimal \eqn{\tau} that minimizes the misclassification error under the budget constraint.}
#'
#' @examples
#' \donttest{
#' library(reinforcedPred)
#' set.seed(1)
#'
#' # take the example training data (univariate Z) from the reinforcedPred package
#' # see documentation for details about the data set train_data_uniZ
#' Y <- as.numeric(train_data_uniZ$Y)
#' tildeX.missing <- as.matrix(train_data_uniZ[,2:62])
#' Z <- as.numeric(train_data_uniZ$Z)
#'
#' # analysis starts
#' budget <- 45
#' folds <- 5
#' startT <- 25
#' link <- "probit"
#'
#' result <- reinforced(Y, tildeX.missing, Z, budget, folds, startT, link, pve = 0.99, nbasis = 10)
#'
#' # obtained parameter estimates and FPCA decompositions
#' list_paraEst <- (result$final.result)$list_paraEst
#' list_fpcaFit <- (result$final.result)$list_fpcaFit
#'
#' # optimal tau that minimizes the misclassification error under the budget constraint
#' final.tau <- result$final.tau
#' final.tau
#'
#' # use the fitted model to predict the label Y for subjects in the test data
#' # see documentation for details about the data set test_data_uniZ
#' testY <- as.numeric(test_data_uniZ$testY)
#' test.tildeX.missing <- as.matrix(test_data_uniZ[,2:62])
#' test.Z <- as.numeric(test_data_uniZ$test.Z)
#'
#' pred <- modelPredict(list_fpcaFit, list_paraEst, test.tildeX.missing, test.Z, startT, final.tau)
#'
#' # predicted outcome Y for each subject in the test data
#' predY.test <- pred$final.label
#' # misclassification error
#' mis.error <- sum(predY.test != testY, na.rm = TRUE) / sum(!is.na(testY))
#' mis.error
#'
#' # the average cost when we applied the prediction procedure to the test data
#' pred$avg.cost
#' }


reinforced <- function(Y, X, Z, budget, folds, startT, link, pve = 0.99, nbasis = 10, weight) {

  n <- length(Y)                  # sample size

  # if the user does not supply the weight #
  if (missing(weight)) {

    # FPCA based on all longitudinal information #
    full.fpcaFit <- refund::fpca.sc(Y = X, var = TRUE, pve = pve)

    # FPC scores: these will later be used to model the probability that Y is missing #
    full.FPC.score <- full.fpcaFit$scores

    R <- !is.na(Y)                 # indicator variable for the non-missingness of the outcome variable Y #

    if (sum(R) == length(Y)) {
      # if there is no missing value for Y #
      weight <- rep(1, n)
    } else {
      # if outcome is missing for some subjects #

      # model the probability that the outcome is missing #
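      # inverse probability weighting: each complete case i is weighted by
      # 1 / P(R_i = 1 | FPC scores, Z), estimated via logistic regression #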
      fit.propen <- stats::glm(R ~ full.FPC.score + Z, family = "binomial")
      prob <- fit.propen$fitted.values
      weight <- 1 / prob
    }

  }

  # we search over a sequence of tau values #
  tau.sequence <- seq(0, 3, 0.02)

  # record the misclassification error and the cost; each row corresponds to one fold, each column to one tau #
  error.matrix <- matrix(NA, nrow = folds, ncol = length(tau.sequence))
  cost.matrix <- matrix(NA, nrow = folds, ncol = length(tau.sequence))

  # cross-validation procedure #
  samplen <- sample(n)

  for (i in 1:folds) {
    tst_idx <- samplen[seq(i, n, by = folds)]                  # index set for test data
    trn_idx <- setdiff(1:n, tst_idx)                           # index set for training data

    trn_Y <- Y[trn_idx]
    trn_X <- X[trn_idx, ]
    trn_weight <- weight[trn_idx]

    tst_Y <- Y[tst_idx]
    tst_X <- X[tst_idx, ]

    if (is.vector(Z)) {
      trn_Z <- Z[trn_idx]
      tst_Z <- Z[tst_idx]
    } else {
      trn_Z <- Z[trn_idx, ]
      tst_Z <- Z[tst_idx, ]
    }

    # From the training set, we fit FPCA decompositions and obtain the parameter estimates #
    Fit <- modelFit(trn_Y, trn_X, trn_Z, startT, link, pve, nbasis, trn_weight)
    list_fpcaFit <- Fit$list_fpcaFit
    list_paraEst <- Fit$list_paraEst

    for (j in seq_along(tau.sequence)) {
      curr.tau <- tau.sequence[j]
      # use the fitted model to make the prediction for individuals in the test set #
      result <- modelPredict(list_fpcaFit, list_paraEst, tst_X, tst_Z, startT, curr.tau)
      error.matrix[i, j] <- sum(result$final.label != tst_Y, na.rm = TRUE) / sum(!is.na(tst_Y))
      cost.matrix[i, j] <- result$avg.cost
    }
  }

  test.error <- colMeans(error.matrix)
  test.cost <- colMeans(cost.matrix)

  index.cost <- which(test.cost <= budget)                  # index set for tau that satisfies budget constraint

  tau.sequence.satisfied <- tau.sequence[index.cost]
  test.error.satisfied <- test.error[index.cost]
  test.cost.satisfied <- test.cost[index.cost]

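  # among the taus satisfying the budget that attain the minimal cross-validated
  # error, pick the largest one #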
  final.tau <- tau.sequence.satisfied[max(which(test.error.satisfied == min(test.error.satisfied)))]

  final.result <- modelFit(Y, X, Z, startT, link, pve, nbasis, weight)

  return (list("final.result" = final.result, "final.tau" = final.tau))
}


#' Example training data (univariate Z)
#'
#' @format A data frame with 100 rows and 63 columns. The 1st column is the outcome variable Y, columns 2 through 62
#' contain the longitudinal biomarker at 61 time grids, and the 63rd column is the baseline covariate Z.
#'
#' @source A simulated data set
"train_data_uniZ"

#' Example test data (univariate Z)
#'
#' @format A data frame with 2000 rows and 63 columns. The 1st column is the outcome variable Y, columns 2 through 62
#' contain the longitudinal biomarker at 61 time grids, and the 63rd column is the baseline covariate Z.
#'
#' @source A simulated data set
"test_data_uniZ"
