R/pssmooth.R

Defines functions plotMCEPcurve testEquality testConstancy summary.riskCurve invtContrastRiskCurve tContrastRiskCurve contrastRiskCurve riskCurve bootRiskCurve risk riskV riskP propX rowMatch hDen hNum tpsPredict expit logit

Documented in bootRiskCurve plotMCEPcurve riskCurve summary.riskCurve testConstancy testEquality

#' @import graphics
NULL
#' @import stats
NULL
#' @import utils
NULL
#' @importFrom osDesign tps
NULL
#' @import np
NULL
#' @import chngpt
NULL
#' @import MASS
NULL

globalVariables(c("Z","S","Y"))

logit <- function(p){
  return(log(p/(1-p)))
}

expit <- function(x){
  return(exp(x)/(1+exp(x)))
}

# 'tpsPredict' returns predicted values from a model fitted by tps
# columns of newMatrix in the same order as the coefficient vector from tps
tpsPredict <- function(fit, newMatrix){
  linPred <- newMatrix %*% fit$coef
  return(drop(1/(1+exp(-linPred))))
}

# 'hNum' returns function values at s0 of the integrand in the numerator of P{Y(0)=1 | S(1)=s1}
# s0 is a numeric vector, whereas s1 is a scalar
hNum <- function(s0, s1, lev, vars, data, pstype, bsmtype, tpsFit, changePoint=NULL, npcdensFit1, npcdensFit2){
  ss0 <- s0
  if (!is.null(changePoint)){ ss0 <- ifelse(s0>changePoint, s0-changePoint, 0) }

  # if any baseline covariates are specified
  if (length(vars)>=1){
    phat.s0 <- tpsPredict(tpsFit, cbind(1, ss0, matrix(lev[-1], nrow=length(ss0), ncol=length(lev[-1]), byrow=TRUE)))

    lev1names <- names(which(lev==1))
    predictLevels <- NULL
    for (vi in 1:length(vars)){
      varLevels <- levels(as.factor(data[,vars[vi]]))
      predictLevelVar <- NULL
      for (li in 2:length(varLevels)){
        if (any(grepl(vars[vi], lev1names) & grepl(varLevels[li], lev1names))){
          predictLevelVar <- varLevels[li]
          break
        }
      }
      if (is.null(predictLevelVar)){ predictLevelVar <- varLevels[1] }
      predictLevels <- c(predictLevels, predictLevelVar)
    }

    dfVars <- as.data.frame(matrix(predictLevels, nrow=length(s0), ncol=length(predictLevels), byrow=TRUE))
    colnames(dfVars) <- vars

    if (pstype=="ordered"){ s1 <- ordered(s1) }
    if (bsmtype=="ordered"){ s0 <- ordered(s0) }
    fhat.s0 <- predict(npcdensFit1, newdata=data.frame(Sb=s0, dfVars, S=s1))
    ghat.s0 <- predict(npcdensFit2, newdata=data.frame(dfVars, S=s0))
  } else {
    phat.s0 <- tpsPredict(tpsFit, cbind(1, ss0))
    if (pstype=="ordered"){ s1 <- ordered(s1) }
    if (bsmtype=="ordered"){ s0 <- ordered(s0) }
    fhat.s0 <- predict(npcdensFit1, newdata=data.frame(Sb=s0, S=s1))
    ghat.s0 <- predict(npcdensFit2, newdata=data.frame(S=s0))
  }

  return(phat.s0*fhat.s0*ghat.s0)
}

# 'hDen' returns function values at s0 of the integrand in the denominator of risk_{(0)}(s_1)
# s0 is a numeric vector, whereas s1 is a scalar
hDen <- function(s0, s1, lev, vars, data, pstype, bsmtype, npcdensFit1, npcdensFit2){
  # if any baseline covariates are specified
  if (length(vars)>=1){
    lev1names <- names(which(lev==1))
    predictLevels <- NULL
    for (vi in 1:length(vars)){
      varLevels <- levels(as.factor(data[,vars[vi]]))
      predictLevelVar <- NULL
      for (li in 2:length(varLevels)){
        if (any(grepl(vars[vi], lev1names) & grepl(varLevels[li], lev1names))){
          predictLevelVar <- varLevels[li]
          break
        }
      }
      if (is.null(predictLevelVar)){ predictLevelVar <- varLevels[1] }
      predictLevels <- c(predictLevels, predictLevelVar)
    }

    dfVars <- as.data.frame(matrix(predictLevels, nrow=length(s0), ncol=length(predictLevels), byrow=TRUE))
    colnames(dfVars) <- vars

    fhat.s0 <- predict(npcdensFit1, newdata=data.frame(Sb=s0, dfVars, S=s1))
    ghat.s0 <- predict(npcdensFit2, newdata=data.frame(dfVars, S=s0))
  } else {
    fhat.s0 <- predict(npcdensFit1, newdata=data.frame(Sb=s0, S=s1))
    ghat.s0 <- predict(npcdensFit2, newdata=data.frame(S=s0))
  }
  return(fhat.s0*ghat.s0)
}

rowMatch <- function(x, y){
  return(sum(abs(x-y))==0)
}

propX <- function(X, lev){
  Xmatch <- apply(as.matrix(X), 1, rowMatch, y=lev)
  return(mean(Xmatch))
}

# 'riskP' returns the value of P{Y(0)=1 | S(1)=s1}
# s1 is a scalar
# all baseline covariates are assumed to be discrete variables with a finite number of categories
# 'formula' is a one-sided formula of baseline covariates
riskP <- function(s1, formula, data, pstype, bsmtype, tpsFit, npcdensFit1, npcdensFit2, changePoint=NULL){
  den <- num <- 0
  UL <- 1.05 * max(data$S, na.rm=TRUE) # if integration over (0,Inf) fails, use (0,UL)

  mf <- model.frame(formula, data)
  X <- model.matrix(terms(formula), mf)
  Xu <- unique(X)
  vars <- all.vars(formula)
  for (i in 1:NROW(Xu)){
    lev <- drop(Xu[i,])
    names(lev) <- colnames(Xu)
    pX <- propX(X, lev)
    # if both 'pstype' and 'bsmtype' are ordered categorical, replace numerical integration with simple summation because 'predict' on objects from 'npudens' and 'npcdens' assigns probability mass to values
    # other than the discrete values taken on by the categorical variables (consequently, the sum of the probability mass would exceed 1)
    if (pstype=="ordered" & bsmtype=="ordered"){
      num <- num + pX*sum(hNum(s0=as.numeric(as.vector(sort(unique(data$S)))), s1=s1, lev=lev, vars=vars, data=data, pstype=pstype, bsmtype=bsmtype, tpsFit=tpsFit, changePoint=changePoint, npcdensFit1=npcdensFit1, npcdensFit2=npcdensFit2))
      den <- den + pX*sum(hDen(s0=as.numeric(as.vector(sort(unique(data$S)))), s1=s1, lev=lev, vars=vars, data=data, pstype=pstype, bsmtype=bsmtype, npcdensFit1=npcdensFit1, npcdensFit2=npcdensFit2))
    } else {
      hNumInt <- try(integrate(hNum, 0, Inf, s1=s1, lev=lev, vars=vars, data=data, pstype=pstype, bsmtype=bsmtype, tpsFit=tpsFit, changePoint=changePoint, npcdensFit1=npcdensFit1, npcdensFit2=npcdensFit2, subdivisions=2000)$value, silent=TRUE)
      if (inherits(hNumInt, 'try-error')){
        hNumInt <- try(integrate(hNum, 0, UL, s1=s1, lev=lev, vars=vars, data=data, pstype=pstype, bsmtype=bsmtype, tpsFit=tpsFit, changePoint=changePoint, npcdensFit1=npcdensFit1, npcdensFit2=npcdensFit2, subdivisions=2000)$value, silent=TRUE)
        
        if (inherits(hNumInt, 'try-error')){
          hNumInt <- try(integrate(hNum, 0, UL, s1=s1, lev=lev, vars=vars, data=data, pstype=pstype, bsmtype=bsmtype, tpsFit=tpsFit, changePoint=changePoint, npcdensFit1=npcdensFit1, npcdensFit2=npcdensFit2, subdivisions=2000, rel.tol=30*.Machine$double.eps^0.25)$value, silent=TRUE)
        }
      }
      
      if (inherits(hNumInt, 'try-error')){
        return(NA)
      } else {
        num <- num + pX*hNumInt  
      }
      
      hDenInt <- try(integrate(hDen, 0, UL, s1=s1, lev=lev, vars=vars, data=data, pstype=pstype, bsmtype=bsmtype, npcdensFit1=npcdensFit1, npcdensFit2=npcdensFit2, subdivisions=2000, rel.tol=30*.Machine$double.eps^0.25)$value, silent=TRUE)
      if (inherits(hDenInt, 'try-error')){
        hDenInt <- try(integrate(hDen, 0, UL, s1=s1, lev=lev, vars=vars, data=data, pstype=pstype, bsmtype=bsmtype, npcdensFit1=npcdensFit1, npcdensFit2=npcdensFit2, subdivisions=2000, rel.tol=.Machine$double.eps^0.05)$value, silent=TRUE)
      }
      
      if (inherits(hDenInt, 'try-error')){
        return(NA)
      } else {
        den <- den + pX*hDenInt  
      }
    }
  }

  return(num/den)
}

# 'riskV' returns the value of P{Y(1)=1 | S(1)=s1}
# s1 is a scalar
riskV <- function(s1, data, data2, changePoint=NULL){
  nTControls <- NROW(subset(data, Z==1 & Y==0))
  nTCases <- NROW(subset(data, Z==1 & Y==1))
  tpsFormula <- paste0("Y~",ifelse(is.null(changePoint),"S","Sc"))
  fit <- tps(tpsFormula, data=subset(data2, Z==1 & !is.na(Y)), nn0=nTControls, nn1=nTCases, group=rep(1, NROW(subset(data2, Z==1 & !is.na(Y)))), method="PL", cohort=TRUE)
  if (!is.null(changePoint)){ s1 <- ifelse(s1>changePoint,s1-changePoint,0) }
  return(tpsPredict(fit, cbind(1, s1)))
}

# 'risk' returns the estimates of risk in each study group for a given s1
# s1 is a scalar
risk <- function(s1, formula, data, data2, pstype, bsmtype, tpsFit, npcdensFit1, npcdensFit2, changePoint=NULL){
  risk1 <- riskV(s1, data, data2, changePoint)
  risk0 <- riskP(s1, formula, data, pstype, bsmtype, tpsFit, npcdensFit1, npcdensFit2, changePoint)
  return(list(plaRisk=risk0, txRisk=risk1))
}

#' Bootstrap Estimation of Conditional Clinical Endpoint Risk under Placebo and Treatment Given Biomarker Response to Treatment in a Baseline Surrogate Measure
#' Three-Phase Sampling Design
#'
#' Estimates \eqn{P\{Y(z)=1|S(1)=s_1\}}, \eqn{z=0,1}, on a grid of \eqn{s_1} values in bootstrap resamples (see \code{\link{riskCurve}} for notation introduction). Cases
#' (\eqn{Y=1}) and controls (\eqn{Y=0}) are sampled separately yielding a fixed number of cases and controls in each bootstrap sample. Consequentially, the number of controls
#' with available phase 2 data varies across bootstrap samples.
#'
#' @param formula a formula object with the binary clinical endpoint on the left of the \code{~} operator. The first listed variable on the right must be the biomarker response
#' at \eqn{t0} and all variables that follow, if any, are discrete baseline covariates specified in all fitted models that condition on them. Interactions and transformations
#' of the baseline covariates are allowed. All terms in the formula must be evaluable in the data frame \code{data}.
#' @param bsm a character string specifying the variable name in \code{data} representing the baseline surrogate measure
#' @param tx a character string specifying the variable name in \code{data} representing the treatment group indicator
#' @param data a data frame with one row per randomized participant endpoint-free at \eqn{t_0} that contains at least the variables specified in \code{formula}, \code{bsm} and
#' \code{tx}. Values of \code{bsm} and the biomarker at \eqn{t_0} that are unavailable are represented as \code{NA}.
#' @param pstype a character string specifying whether the biomarker response shall be treated as a \code{continuous} (default) or \code{ordered} categorical variable in the
#' kernel density/probability estimation
#' @param bsmtype a character string specifying whether the baseline surrogate measure shall be treated as a \code{continuous} (default) or \code{ordered} categorical variable in the
#' kernel density/probability estimation
#' @param bwtype a character string specifying the bandwidth type for continuous variables in the kernel density estimation. The options are \code{fixed} (default) for fixed
#' bandwidths, \code{generalized_nn} for generalized nearest neighbors, and \code{adaptive_nn} for adaptive nearest neighbors. As noted in the documentation of the function
#' \code{npcdensbw} in the \code{np} package: "Adaptive nearest-neighbor bandwidths change with each sample realization in the set when estimating the density at the point \eqn{x}.
#' Generalized nearest-neighbor bandwidths change with the point at which the density is estimated, \eqn{x}. Fixed bandwidths are constant over the support of \eqn{x}."
#' @param hinge a logical value (\code{FALSE} by default) indicating whether a hinge model (Fong et al., 2017) shall be used for modeling the effect of \eqn{S(z)} on the
#' clinical endpoint risk. A hinge model specifies that variability in \eqn{S(z)} below the hinge point does not associate with the clinical endpoint risk. The hinge point
#' is reestimated in each bootstrap sample.
#' @param weights either a numeric vector of weights or a character string specifying the variable name in \code{data} representing weights applied to observations
#' in the phase 2 subset in order to make inference about the target population of all randomized participants endpoint-free at \eqn{t_0}. The weights reflect that
#' the case:control ratio in the phase 2 subset is different from that in the target population and are passed on to GLMs in the estimation of the hinge point.
#' If \code{NULL} (default and recommended), weights for cases and controls are recalculated separately in each study group \emph{within each bootstrap sample}; otherwise the
#' same specified vector of weights is used in each bootstrap sample.
#' @param psGrid a numeric vector of \eqn{S(1)} values at which the conditional clinical endpoint risk in each study group is estimated. If \code{NULL} (default),
#' a grid of values spanning the range of observed values of the biomarker will be used.
#' @param iter the number of bootstrap iterations
#' @param seed a seed of the random number generator supplied to \code{set.seed} for reproducibility
#' @param saveFile a character string specifying the name of an \code{.RData} file storing the output list. If \code{NULL} (default), the output list will only be returned.
#' @param saveDir a character string specifying a path for the output directory. If \code{NULL} (default), the output list will only be returned; otherwise, if
#' \code{saveFile} is specified, the output list will also be saved as an \code{.RData} file in the specified directory.
#'
#' @return If \code{saveFile} and \code{saveDir} are both specified, the output list (named \code{bList}) is saved as an \code{.RData} file; otherwise it is returned only.
#' The output object is a list with the following components:
#' \itemize{
#' \item \code{psGrid}: a numeric vector of \eqn{S(1)} values at which the conditional clinical endpoint risk is estimated in the components \code{plaRiskCurveBoot} and
#' \code{txRiskCurveBoot}
#' \item \code{plaRiskCurveBoot}: a \code{length(psGrid)}-by-\code{iter} matrix of estimates of \eqn{P\{Y(0)=1|S(1)=s_1\}} for \eqn{s_1} in \code{psGrid},
#' with columns representing bootstrap samples
#' \item \code{txRiskCurveBoot}: a \code{length(psGrid)}-by-\code{iter} matrix of estimates of \eqn{P\{Y(1)=1|S(1)=s_1\}} for \eqn{s_1} in \code{psGrid},
#' with columns representing bootstrap samples
#' \item \code{cpointPboot}: if \code{hinge=TRUE}, a numeric vector of estimates of the hinge point in the placebo group in each bootstrap sample
#' \item \code{cpointTboot}: if \code{hinge=TRUE}, a numeric vector of estimates of the hinge point in the treatment group in each bootstrap sample
#' }
#'
#' @references Fong, Y., Huang, Y., Gilbert, P. B., and Permar, S. R. (2017), chngpt: threshold regression model estimation and inference, \emph{BMC Bioinformatics}, 18.
#'
#' @examples
#' n <- 500
#' Z <- rep(0:1, each=n/2)
#' S <- MASS::mvrnorm(n, mu=c(2,2,3), Sigma=matrix(c(1,0.9,0.7,0.9,1,0.7,0.7,0.7,1), nrow=3))
#' p <- pnorm(drop(cbind(1,Z,(1-Z)*S[,2],Z*S[,3]) %*% c(-1.2,0.2,-0.02,-0.2)))
#' Y <- sapply(p, function(risk){ rbinom(1,1,risk) })
#' X <- rbinom(n,1,0.5)
#' # delete S(1) in placebo recipients
#' S[Z==0,3] <- NA
#' # delete S(0) in treatment recipients
#' S[Z==1,2] <- NA
#' # generate the indicator of being sampled into the phase 2 subset
#' phase2 <- rbinom(n,1,0.4)
#' # delete Sb, S(0) and S(1) in controls not included in the phase 2 subset
#' S[Y==0 & phase2==0,] <- c(NA,NA,NA)
#' # delete Sb in cases not included in the phase 2 subset
#' S[Y==1 & phase2==0,1] <- NA
#' data <- data.frame(X,Z,S[,1],ifelse(Z==0,S[,2],S[,3]),Y)
#' colnames(data) <- c("X","Z","Sb","S","Y")
#' qS <- quantile(data$S, probs=c(0.05,0.95), na.rm=TRUE)
#' grid <- seq(qS[1], qS[2], length.out=3)
#'
#' out <- bootRiskCurve(formula=Y ~ S + factor(X), bsm="Sb", tx="Z", data=data,
#'                      psGrid=grid, iter=1, seed=10)
#' \donttest{
#' # alternatively, to save the .RData output file (no '<-' needed):
#' bootRiskCurve(formula=Y ~ S + factor(X), bsm="Sb", tx="Z", data=data,
#'               psGrid=grid, iter=1, seed=10, saveFile="out.RData", saveDir="./")
#' }
#'
#' @seealso \code{\link{riskCurve}}, \code{\link{summary.riskCurve}} and \code{\link{plotMCEPcurve}}
#' @export
bootRiskCurve <- function(formula, bsm, tx, data, pstype=c("continuous", "ordered"), bsmtype=c("continuous", "ordered"), bwtype=c("fixed", "generalized_nn", "adaptive_nn"),
                          hinge=FALSE, weights=NULL, psGrid=NULL, iter, seed=NULL, saveFile=NULL, saveDir=NULL){
  if (missing(bsm)){ stop("The variable name in argument 'bsm' for the baseline surrogate measure is missing.") }
  if (missing(tx)){ stop("The variable name in argument 'tx' for the treatment group indicator is missing.") }
  if (missing(data)){ stop("The data frame 'data' for interpreting the variables in 'formula' is missing.") }
  if (missing(iter)){ stop("The number of bootstrap iterations in argument 'iter' is missing.") }

  pstype <- match.arg(pstype)
  bsmtype <- match.arg(bsmtype)
  bwtype <- match.arg(bwtype)

  if (!is.null(weights)){
    if (is.character(weights)){
      colnames(data)[colnames(data)==weights] <- "weights"
    } else {
      # assuming 'weights' is a numeric vector
      data$weights <- weights
    }
  }

  formulaDecomp <- strsplit(strsplit(paste(deparse(formula), collapse = ""), " *[~] *")[[1]], " *[+] *")
  anyBaselineCovar <- length(formulaDecomp[[2]])>1

  # standardize the variable name for the treatment group indicator
  colnames(data)[colnames(data)==tx] <- "Z"

  # standardize the variable name for the biomarker measurement at fixed time t0 post-randomization
  # this line assumes that it is the first listed variable on the RHS of 'formula'
  colnames(data)[colnames(data)==formulaDecomp[[2]][1]] <- "S"

  # standardize the variable name for the biomarker's baseline measurement
  colnames(data)[colnames(data)==bsm] <- "Sb"

  # standardize the variable name for the binary clinical endpoint measured after t0
  colnames(data)[colnames(data)==formulaDecomp[[1]]] <- "Y"

  # extract the subset with available phase 2 data (i.e., with measured S)
  data2 <- subset(data, !is.na(S))

  # case deletion method applied to 'data2' to match the case:control ratio in 'data', separately in each study group,
  # in order to be able to use the standard kernel density estimation procedure
  dataControls <- subset(data, Y==0)
  nControls <- NROW(dataControls)
  nPControls <- NROW(subset(dataControls, Z==0))
  nTControls <- NROW(subset(dataControls, Z==1))

  nPControls2 <- NROW(dataPControls2 <- subset(data2, Z==0 & Y==0))
  nTControls2 <- NROW(dataTControls2 <- subset(data2, Z==1 & Y==0))

  dataCases <- subset(data, Y==1)
  nCases <- NROW(dataCases)
  nPCases <- NROW(subset(dataCases, Z==0))
  nTCases <- NROW(subset(dataCases, Z==1))

  nPCases2 <- NROW(dataPCases2 <- subset(data2, Z==0 & Y==1))
  nTCases2 <- NROW(dataTCases2 <- subset(data2, Z==1 & Y==1))

  # in each study group separately, calculate the required number of cases in 'data2' to match the case:control ratio in 'data'
  nPCases2new <- nPCases * nPControls2 / nPControls
  nTCases2new <- nTCases * nTControls2 / nTControls

  # in each study group separately, randomly sample cases to match the case:control ratio in 'data'
  dataP2correctRatio <- rbind(dataPControls2, dataPCases2[sample(1:nPCases2, max(1,round(nPCases2new,0))),])
  dataT2correctRatio <- rbind(dataTControls2, dataTCases2[sample(1:nTCases2, max(1,round(nTCases2new,0))),])
  rm(dataPControls2); rm(dataPCases2); rm(dataTControls2); rm(dataTCases2)

  # estimate optimal bandwidths for kernel estimates of the conditional (or unconditional) densities
  Sterm <- "S"
  if (pstype=="ordered"){ Sterm <- "ordered(S)"}
  Sbterm <- "Sb"
  if (bsmtype=="ordered"){ Sbterm <- "ordered(Sb)"}

  if (anyBaselineCovar){
    fm.fbw <- as.formula(paste0(Sterm," ~ ",paste(c(Sbterm,formulaDecomp[[2]][-1]),collapse="+")))
    fm.gbw <- as.formula(paste0(Sterm," ~ ",paste(formulaDecomp[[2]][-1],collapse="+")))
    gbw <- npcdensbw(fm.gbw, data=dataP2correctRatio, cxkertype="epanechnikov", cykertype="epanechnikov", bwtype=bwtype)
  } else {
    fm.fbw <- as.formula(paste0(Sterm," ~ ",Sbterm))
    fm.gbw <- as.formula(paste0("~ ",Sterm))
    # marginal density
    gbw <- npudensbw(fm.gbw, data=dataP2correctRatio, ckertype="epanechnikov", bwtype=bwtype)
  }
  fbw <- npcdensbw(fm.fbw, data=dataT2correctRatio, cxkertype="epanechnikov", cykertype="epanechnikov", bwtype=bwtype)

  if(!is.null(seed)){ set.seed(seed) }
  bSampleControls <- matrix(sample(1:nControls, nControls*iter, replace=TRUE), nrow=nControls, ncol=iter)
  bSampleCases <- matrix(sample(1:nCases, nCases*iter, replace=TRUE), nrow=nCases, ncol=iter)

  # a grid of values of S on which the bootstrapped risk curves are returned
  if (is.null(psGrid)){
    psGrid <- seq(min(data2$S), max(data2$S), length.out=200)
  }
  rm(data2)

  bRiskCurveList <- lapply(1:iter, function(i, formulaDecomp, Sterm, Sbterm){
    # create a bootstrap sample
    bdata <- rbind(dataControls[bSampleControls[,i],], dataCases[bSampleCases[,i],])
    # extract the bootstrap subset with phase 2 data
    bdata2 <- subset(bdata, !is.na(S))

    # estimate the hinge point in each bootstrap iteration
    if (hinge){
      # recalculate weights in each bootstrap iteration for passing on to 'chngptm'
      if (is.null(weights)){
        bdata2$weights <- NA
        bdata2$weights <- ifelse(bdata2$Z==0 & bdata2$Y==0, nPControls/nPControls2, bdata2$weights)
        bdata2$weights <- ifelse(bdata2$Z==1 & bdata2$Y==0, nTControls/nTControls2, bdata2$weights)
        bdata2$weights <- ifelse(bdata2$Z==0 & bdata2$Y==1, nPCases/nPCases2, bdata2$weights)
        bdata2$weights <- ifelse(bdata2$Z==1 & bdata2$Y==1, nTCases/nTCases2, bdata2$weights)
      }

      # in each study group separately, estimate the hinge point for the association of S with Y
      if (anyBaselineCovar){
        fm <- as.formula(paste0("Y ~ ",paste(formulaDecomp[[2]][-1],collapse="+")))
      } else {
        fm <- Y ~ 1
      }
      cpointP <- chngptm(formula.1=fm, formula.2=~S, data=subset(bdata2, Z==0 & !is.na(Y)), family="binomial", type="hinge", weights=subset(bdata2, Z==0 & !is.na(Y))$weights)$coefficients["chngpt"]
      cpointT <- chngptm(formula.1=Y ~ 1, formula.2=~S, data=subset(bdata2, Z==1 & !is.na(Y)), family="binomial", type="hinge", weights=subset(bdata2, Z==1 & !is.na(Y))$weights)$coefficients["chngpt"]
      # use their minimum as the hinge point in the below specified GLMs
      cpoint <- min(cpointP, cpointT)

      # biomarker S left-censored at 'cpoint' for use in the below specified GLMs
      bdata$Sc <- with(bdata, ifelse(S>cpoint, S-cpoint, 0))

      # re-extract the subset with phase 2 data
      bdata2 <- subset(bdata, !is.na(S))

      # IPW logistic regression model fitted to placebo recipients in the phase 2 subset accounting for two-phase sampling of S
      if (anyBaselineCovar){
        fm <- as.formula(paste0("Y ~ ",paste(c("Sc",formulaDecomp[[2]][-1]),collapse="+")))
      } else {
        fm <- Y ~ Sc
      }
      fit1 <- tps(fm, data=subset(bdata2, Z==0 & !is.na(Y)), nn0=NROW(subset(bdata, Z==0 & Y==0)), nn1=NROW(subset(bdata, Z==0 & Y==1)), group=rep(1, NROW(subset(bdata2, Z==0 & !is.na(Y)))), method="PL", cohort=TRUE)
    } else {
      # for passing on to the function 'risk'
      cpoint <- NULL

      # IPW logistic regression model fitted to placebo recipients in the phase 2 subset accounting for two-phase sampling of S
      if (anyBaselineCovar){
        fm <- as.formula(paste0("Y ~ ",paste(c("S",formulaDecomp[[2]][-1]),collapse="+")))
      } else {
        fm <- Y ~ S
      }
      fit1 <- tps(fm, data=subset(bdata2, Z==0 & !is.na(Y)), nn0=NROW(subset(bdata, Z==0 & Y==0)), nn1=NROW(subset(bdata, Z==0 & Y==1)), group=rep(1, NROW(subset(bdata2, Z==0 & !is.na(Y)))), method="PL", cohort=TRUE)
    }

    bdataControls <- subset(bdata, Y==0)
    nPControls2 <- NROW(bdataPControls2 <- subset(bdata2, Z==0 & Y==0))
    nTControls2 <- NROW(bdataTControls2 <- subset(bdata2, Z==1 & Y==0))

    bdataCases <- subset(bdata, Y==1)
    nPCases2 <- NROW(bdataPCases2 <- subset(bdata2, Z==0 & Y==1))
    nTCases2 <- NROW(bdataTCases2 <- subset(bdata2, Z==1 & Y==1))

    nPControls <- NROW(subset(bdataControls, Z==0))
    nTControls <- NROW(subset(bdataControls, Z==1))
    nPCases <- NROW(subset(bdataCases, Z==0))
    nTCases <- NROW(subset(bdataCases, Z==1))

    # in each study group separately, recalculate the required number of cases in 'bdata2' to match the case:control ratio in 'bdata'
    nPCases2new <- nPCases * nPControls2 / nPControls
    nTCases2new <- nTCases * nTControls2 / nTControls

    # in each study group separately, randomly sample cases to match the case:control ratio in 'bdata'
    bdataP2correctRatio <- rbind(bdataPControls2, bdataPCases2[sample(1:nPCases2, max(1,round(nPCases2new,0))),])
    bdataT2correctRatio <- rbind(bdataTControls2, bdataTCases2[sample(1:nTCases2, max(1,round(nTCases2new,0))),])
    rm(bdataPControls2); rm(bdataPCases2); rm(bdataTControls2); rm(bdataTCases2)

    # ensure that all factor levels of S and Sb in the data used for computing 'fbw' and 'gbw' are included in the data used for computing 'bfbw' and 'bgbw'
    if (Sterm=="ordered(S)"){
      bdataP2correctRatio$S <- factor(bdataP2correctRatio$S, levels=levels(ordered(dataP2correctRatio$S)), ordered=TRUE)
      bdataT2correctRatio$S <- factor(bdataT2correctRatio$S, levels=levels(ordered(dataT2correctRatio$S)), ordered=TRUE)
    }

    if (Sbterm=="ordered(Sb)"){
      bdataT2correctRatio$Sb <- factor(bdataT2correctRatio$Sb, levels=levels(ordered(dataT2correctRatio$Sb)), ordered=TRUE)
    }

    # kernel density estimator for g(s0|X=x) using the placebo group in the phase 2 subset
    if (anyBaselineCovar){
      # the formulas are reintroduced in the 'lapply' so that 'npcdensbw' and 'npudensbw' are able to evaluate them in the parent.frame() environment
      fm.fbw <- as.formula(paste0(Sterm," ~ ",paste(c(Sbterm,formulaDecomp[[2]][-1]),collapse="+")))
      fm.gbw <- as.formula(paste0(Sterm," ~ ",paste(formulaDecomp[[2]][-1],collapse="+")))
      bgbw <- npcdensbw(fm.gbw, data=bdataP2correctRatio, bws=gbw, bandwidth.compute=FALSE)
      ghat <- npcdens(bgbw)
    } else {
      fm.fbw <- as.formula(paste0(Sterm," ~ ",Sbterm))
      fm.gbw <- as.formula(paste0("~ ",Sterm))
      # marginal density
      bgbw <- npudensbw(fm.gbw, data=bdataP2correctRatio, bws=gbw, bandwidth.compute=FALSE)
      ghat <- npudens(bgbw)
    }

    # kernel density estimator for f(s1|Sb=s0, X=x) using the treatment group in the phase 2 subset
    bfbw <- npcdensbw(fm.fbw, data=bdataT2correctRatio, bws=fbw, bandwidth.compute=FALSE)
    fhat <- npcdens(bfbw)

    # the first argument of 'risk' is a scalar
    if (anyBaselineCovar){
      fm <- as.formula(paste0("~",paste(formulaDecomp[[2]][-1],collapse="+")))
    } else {
      fm <- ~ 1
    }
    curves <- lapply(psGrid, function(s){ risk(s, fm, bdata, bdata2, pstype, bsmtype, fit1, fhat, ghat, cpoint) })
    plaRiskCurve <- sapply(curves, "[[", "plaRisk")
    txRiskCurve <- sapply(curves, "[[", "txRisk")

    # the output list
    out <- list(plaRiskCurve=plaRiskCurve, txRiskCurve=txRiskCurve)
    if (hinge){
      out$cpointP <- cpointP
      out$cpointT <- cpointT
    }

    return(out)
  }, formulaDecomp=formulaDecomp, Sterm=Sterm, Sbterm=Sbterm)

  # cbind all bootstrap risk curves
  plaRiskCurveBoot <- sapply(bRiskCurveList,"[[","plaRiskCurve")
  txRiskCurveBoot <- sapply(bRiskCurveList,"[[","txRiskCurve")

  # the output list
  bList <- list(psGrid=psGrid, plaRiskCurveBoot=plaRiskCurveBoot, txRiskCurveBoot=txRiskCurveBoot)
  if (hinge){
    bList$cpointPboot <- sapply(bRiskCurveList,"[[","cpointP")
    bList$cpointTboot <- sapply(bRiskCurveList,"[[","cpointT")
  }

  if (!is.null(saveFile)){
    save(bList, file=file.path(saveDir, saveFile))
    cat("Output saved in:\n",file.path(saveDir, saveFile),"\n")
  }

  return(invisible(bList))
}

#' Estimation of Conditional Clinical Endpoint Risk under Placebo and Treatment Given Biomarker Response to Treatment in a Baseline Surrogate Measure Three-Phase Sampling Design
#'
#' Estimates \eqn{P\{Y(z)=1|S(1)=s_1\}}, \eqn{z=0,1}, on a grid of \eqn{s_1} values following the estimation method of Juraska, Huang, and Gilbert (2018), where \eqn{Z} is the
#' treatment group indicator (\eqn{Z=1}, treatment; \eqn{Z=0}, placebo), \eqn{S(z)} is a continuous or ordered categorical univariate biomarker under assignment to \eqn{Z=z}
#' measured at fixed time \eqn{t_0} after randomization, and \eqn{Y} is a binary clinical endpoint (\eqn{Y=1}, disease; \eqn{Y=0}, no disease) measured after \eqn{t_0}. The
#' estimator employs the generalized product kernel density/probability estimation method of Hall, Racine, and Li (2004) implemented in the \code{np} package. The risks
#' \eqn{P\{Y(z)=1|S(z)=s_1,X=x\}}, \eqn{z=0,1}, where \eqn{X} is a vector of discrete baseline covariates, are estimated by fitting inverse probability-weighted logistic regression
#' models using the \code{osDesign} package.
#'
#' @param formula a formula object with the binary clinical endpoint on the left of the \code{~} operator. The first listed variable on the right must be the biomarker response
#' at \eqn{t0} and all variables that follow, if any, are discrete baseline covariates specified in all fitted models that condition on them. Interactions and transformations
#' of the baseline covariates are allowed. All terms in the formula must be evaluable in the data frame \code{data}.
#' @param bsm a character string specifying the variable name in \code{data} representing the baseline surrogate measure
#' @param tx a character string specifying the variable name in \code{data} representing the treatment group indicator
#' @param data a data frame with one row per randomized participant endpoint-free at \eqn{t_0} that contains at least the variables specified in \code{formula}, \code{bsm} and
#' \code{tx}. Values of \code{bsm} and the biomarker at \eqn{t_0} that are unavailable are represented as \code{NA}.
#' @param pstype a character string specifying whether the biomarker response shall be treated as a \code{continuous} (default) or \code{ordered} categorical variable in the
#' kernel density/probability estimation
#' @param bsmtype a character string specifying whether the baseline surrogate measure shall be treated as a \code{continuous} (default) or \code{ordered} categorical variable in the
#' kernel density/probability estimation
#' @param bwtype a character string specifying the bandwidth type for continuous variables in the kernel density estimation. The options are \code{fixed} (default) for fixed
#' bandwidths, \code{generalized_nn} for generalized nearest neighbors, and \code{adaptive_nn} for adaptive nearest neighbors. As noted in the documentation of the function
#' \code{npcdensbw} in the \code{np} package: "Adaptive nearest-neighbor bandwidths change with each sample realization in the set when estimating the density at the point \eqn{x}.
#' Generalized nearest-neighbor bandwidths change with the point at which the density is estimated, \eqn{x}. Fixed bandwidths are constant over the support of \eqn{x}."
#' @param hinge a logical value (\code{FALSE} by default) indicating whether a hinge model (Fong et al., 2017) shall be used for modeling the effect of \eqn{S(z)} on the
#' clinical endpoint risk. A hinge model specifies that variability in \eqn{S(z)} below the hinge point does not associate with the clinical endpoint risk.
#' @param weights either a numeric vector of weights or a character string specifying the variable name in \code{data} representing weights applied to observations
#' in the phase 2 subset in order to make inference about the target population of all randomized participants endpoint-free at \eqn{t_0}. The weights reflect that
#' the case:control ratio in the phase 2 subset is different from that in the target population and are passed on to GLMs in the estimation of the hinge point.
#' If \code{NULL} (default), weights for cases and controls are calculated separately in each study group.
#' @param psGrid a numeric vector of \eqn{S(1)} values at which the conditional clinical endpoint risk in each study group is estimated. If \code{NULL} (default),
#' a grid of values spanning the range of observed values of the biomarker will be used.
#' @param saveFile a character string specifying the name of an \code{.RData} file storing the output list. If \code{NULL} (default), the output list will only be returned.
#' @param saveDir a character string specifying a path for the output directory. If \code{NULL} (default), the output list will only be returned; otherwise, if
#' \code{saveFile} is specified, the output list will also be saved as an \code{.RData} file in the specified directory.
#'
#' @return If \code{saveFile} and \code{saveDir} are both specified, the output list (named \code{oList}) is saved as an \code{.RData} file; otherwise it is returned only.
#' The output object (of class \code{riskCurve}) is a list with the following components:
#' \itemize{
#' \item \code{psGrid}: a numeric vector of \eqn{S(1)} values at which the conditional clinical endpoint risk is estimated in the components \code{plaRiskCurve} and
#' \code{txRiskCurve}
#' \item \code{plaRiskCurve}: a numeric vector of estimates of \eqn{P\{Y(0)=1|S(1)=s_1\}} for \eqn{s_1} in \code{psGrid}
#' \item \code{txRiskCurve}: a numeric vector of estimates of \eqn{P\{Y(1)=1|S(1)=s_1\}} for \eqn{s_1} in \code{psGrid}
#' \item \code{fOptBandwidths}: a \code{conbandwidth} object returned by the call of the function \code{npcdensbw} containing the optimal bandwidths, selected by likelihood
#' cross-validation, in the kernel estimation of the conditional density of \eqn{S(1)} given the baseline surrogate measure and any other specified baseline covariates
#' \item \code{gOptBandwidths}: a \code{conbandwidth} object returned by the call of the function \code{npcdensbw} or \code{npudensbw} containing the optimal bandwidths,
#' selected by likelihood cross-validation, in the kernel estimation of the conditional density of \eqn{S(0)} given any specified baseline covariates or the marginal density
#' of \eqn{S(0)} if no baseline covariates are specified in \code{formula}
#' \item \code{cpointP}: if \code{hinge=TRUE}, the estimate of the hinge point in the placebo group
#' \item \code{cpointT}: if \code{hinge=TRUE}, the estimate of the hinge point in the treatment group
#' }
#'
#' @references Fong, Y., Huang, Y., Gilbert, P. B., and Permar, S. R. (2017), chngpt: threshold regression model estimation and inference, \emph{BMC Bioinformatics}, 18.
#'
#' Hall, P., Racine, J., and Li, Q. (2004), Cross-validation and the estimation of conditional probability densities, \emph{JASA} 99(468), 1015-1026.
#'
#' Juraska, M., Huang, Y., and Gilbert, P. B. (2020), Inference on treatment effect modification by biomarker response in a three-phase sampling design, Biostatistics, 21(3): 545-560, \url{https://doi.org/10.1093/biostatistics/kxy074}.
#'
#' @examples
#' n <- 500
#' Z <- rep(0:1, each=n/2)
#' S <- MASS::mvrnorm(n, mu=c(2,2,3), Sigma=matrix(c(1,0.9,0.7,0.9,1,0.7,0.7,0.7,1), nrow=3))
#' p <- pnorm(drop(cbind(1,Z,(1-Z)*S[,2],Z*S[,3]) %*% c(-1.2,0.2,-0.02,-0.2)))
#' Y <- sapply(p, function(risk){ rbinom(1,1,risk) })
#' X <- rbinom(n,1,0.5)
#' # delete S(1) in placebo recipients
#' S[Z==0,3] <- NA
#' # delete S(0) in treatment recipients
#' S[Z==1,2] <- NA
#' # generate the indicator of being sampled into the phase 2 subset
#' phase2 <- rbinom(n,1,0.4)
#' # delete Sb, S(0) and S(1) in controls not included in the phase 2 subset
#' S[Y==0 & phase2==0,] <- c(NA,NA,NA)
#' # delete Sb in cases not included in the phase 2 subset
#' S[Y==1 & phase2==0,1] <- NA
#' data <- data.frame(X,Z,S[,1],ifelse(Z==0,S[,2],S[,3]),Y)
#' colnames(data) <- c("X","Z","Sb","S","Y")
#' qS <- quantile(data$S, probs=c(0.05,0.95), na.rm=TRUE)
#' grid <- seq(qS[1], qS[2], length.out=3)
#'
#' out <- riskCurve(formula=Y ~ S + factor(X), bsm="Sb", tx="Z", data=data, psGrid=grid)
#' \donttest{
#' # alternatively, to save the .RData output file (no '<-' needed):
#' riskCurve(formula=Y ~ S + factor(X), bsm="Sb", tx="Z", data=data, saveFile="out.RData",
#'           saveDir="./")
#' }
#'
#' @seealso \code{\link{bootRiskCurve}}, \code{\link{summary.riskCurve}} and \code{\link{plotMCEPcurve}}
#' @export
riskCurve <- function(formula, bsm, tx, data, pstype=c("continuous", "ordered"), bsmtype=c("continuous", "ordered"), bwtype=c("fixed", "generalized_nn", "adaptive_nn"),
                      hinge=FALSE, weights=NULL, psGrid=NULL, saveFile=NULL, saveDir=NULL){
  if (missing(bsm)){ stop("The variable name in argument 'bsm' for the baseline surrogate measure is missing.") }
  if (missing(tx)){ stop("The variable name in argument 'tx' for the treatment group indicator is missing.") }
  if (missing(data)){ stop("The data frame 'data' for interpreting the variables in 'formula' is missing.") }

  pstype <- match.arg(pstype)
  bsmtype <- match.arg(bsmtype)
  bwtype <- match.arg(bwtype)

  if (!is.null(weights)){
    if (is.character(weights)){
      colnames(data)[colnames(data)==weights] <- "weights"
    } else {
      # assuming 'weights' is a numeric vector
      data$weights <- weights
    }
  }

  formulaDecomp <- strsplit(strsplit(paste(deparse(formula), collapse = ""), " *[~] *")[[1]], " *[+] *")
  anyBaselineCovar <- length(formulaDecomp[[2]])>1

  # convert any baseline covariates into factors
  if (anyBaselineCovar){
    # 'vars' is of length >= 3
    vars <- all.vars(formula)[-(1:2)]
    for (i in 1:length(vars)){
      data[,vars[i]] <- as.factor(data[,vars[i]])
    }
  }

  # standardize the variable name for the treatment group indicator
  colnames(data)[colnames(data)==tx] <- "Z"

  # standardize the variable name for the biomarker measurement at fixed time t0 post-randomization
  # this line assumes that it is the first listed variable on the RHS of 'formula'
  colnames(data)[colnames(data)==formulaDecomp[[2]][1]] <- "S"

  # standardize the variable name for the biomarker's baseline measurement
  colnames(data)[colnames(data)==bsm] <- "Sb"

  # standardize the variable name for the binary clinical endpoint measured after t0
  colnames(data)[colnames(data)==formulaDecomp[[1]]] <- "Y"

  # extract the subset with available phase 2 data (i.e., with measured S)
  data2 <- subset(data, !is.na(S))

  # case deletion method applied to 'data2' to match the case:control ratio in 'data', separately in each study group,
  # in order to be able to use the standard kernel density estimation procedure
  dataControls <- subset(data, Y==0)
  nPControls <- NROW(subset(dataControls, Z==0))
  nTControls <- NROW(subset(dataControls, Z==1))

  nPControls2 <- NROW(dataPControls2 <- subset(data2, Z==0 & Y==0))
  nTControls2 <- NROW(dataTControls2 <- subset(data2, Z==1 & Y==0))

  dataCases <- subset(data, Y==1)
  nPCases <- NROW(subset(dataCases, Z==0))
  nTCases <- NROW(subset(dataCases, Z==1))

  nPCases2 <- NROW(dataPCases2 <- subset(data2, Z==0 & Y==1))
  nTCases2 <- NROW(dataTCases2 <- subset(data2, Z==1 & Y==1))

  # in each study group separately, calculate the required number of cases in 'data2' to match the case:control ratio in 'data'
  nPCases2new <- nPCases * nPControls2 / nPControls
  nTCases2new <- nTCases * nTControls2 / nTControls

  # in each study group separately, randomly sample cases to match the case:control ratio in 'data'
  dataP2correctRatio <- rbind(dataPControls2, dataPCases2[sample(1:nPCases2, max(1,round(nPCases2new,0))),])
  dataT2correctRatio <- rbind(dataTControls2, dataTCases2[sample(1:nTCases2, max(1,round(nTCases2new,0))),])
  rm(dataPControls2); rm(dataPCases2); rm(dataTControls2); rm(dataTCases2)

  # estimate optimal bandwidths for kernel estimates of the conditional (or unconditional) densities
  Sterm <- "S"
  if (pstype=="ordered"){ Sterm <- "ordered(S)"}
  Sbterm <- "Sb"
  if (bsmtype=="ordered"){ Sbterm <- "ordered(Sb)"}

  if (anyBaselineCovar){
    fm.fbw <- as.formula(paste0(Sterm," ~ ",paste(c(Sbterm,formulaDecomp[[2]][-1]),collapse="+")))
    fm.gbw <- as.formula(paste0(Sterm," ~ ",paste(formulaDecomp[[2]][-1],collapse="+")))
    gbw <- npcdensbw(fm.gbw, data=dataP2correctRatio, cxkertype="epanechnikov", cykertype="epanechnikov", bwtype=bwtype)
  } else {
    fm.fbw <- as.formula(paste0(Sterm," ~ ",Sbterm))
    fm.gbw <- as.formula(paste0("~ ",Sterm))
    # marginal density
    gbw <- npudensbw(fm.gbw, data=dataP2correctRatio, ckertype="epanechnikov", bwtype=bwtype)
  }
  fbw <- npcdensbw(fm.fbw, data=dataT2correctRatio, cxkertype="epanechnikov", cykertype="epanechnikov", bwtype=bwtype)

  if (hinge){
    # calculate weights for passing on to 'chngptm'
    if (is.null(weights) & !("weights" %in% colnames(data))){
      data2$weights <- NA
      data2$weights <- ifelse(data2$Z==0 & data2$Y==0, nPControls/nPControls2, data2$weights)
      data2$weights <- ifelse(data2$Z==1 & data2$Y==0, nTControls/nTControls2, data2$weights)
      data2$weights <- ifelse(data2$Z==0 & data2$Y==1, nPCases/nPCases2, data2$weights)
      data2$weights <- ifelse(data2$Z==1 & data2$Y==1, nTCases/nTCases2, data2$weights)
    }

    # in each study group separately, estimate the hinge point for the association of S with Y
    if (anyBaselineCovar){
      fm <- as.formula(paste0("Y ~ ",paste(formulaDecomp[[2]][-1],collapse="+")))
    } else {
      fm <- Y ~ 1
    }
    cpointP <- chngptm(formula.1=fm, formula.2=~S, data=subset(data2, Z==0 & !is.na(Y)), family="binomial", type="hinge", weights=subset(data2, Z==0 & !is.na(Y))$weights)$coefficients["chngpt"]
    cpointT <- chngptm(formula.1=Y ~ 1, formula.2=~S, data=subset(data2, Z==1 & !is.na(Y)), family="binomial", type="hinge", weights=subset(data2, Z==1 & !is.na(Y))$weights)$coefficients["chngpt"]
    # use their minimum as the hinge point in the below specified GLMs
    cpoint <- min(cpointP, cpointT)

    # biomarker S left-censored at 'cpoint' for use in the below specified GLMs
    data$Sc <- with(data, ifelse(S>cpoint, S-cpoint, 0))

    # re-extract the subset with phase 2 data
    data2 <- subset(data, !is.na(S))

    # IPW logistic regression model fitted to placebo recipients in the phase 2 subset accounting for two-phase sampling of S
    if (anyBaselineCovar){
      fm <- as.formula(paste0("Y ~ ",paste(c("Sc",formulaDecomp[[2]][-1]),collapse="+")))
    } else {
      fm <- Y ~ Sc
    }
    fit1 <- tps(fm, data=subset(data2, Z==0 & !is.na(Y)), nn0=nPControls, nn1=nPCases, group=rep(1, NROW(subset(data2, Z==0 & !is.na(Y)))), method="PL", cohort=TRUE)
  } else {
    # for passing on to the function 'risk'
    cpoint <- NULL

    # IPW logistic regression model fitted to placebo recipients in the phase 2 subset accounting for two-phase sampling of S
    if (anyBaselineCovar){
      fm <- as.formula(paste0("Y ~ ",paste(c("S",formulaDecomp[[2]][-1]),collapse="+")))
    } else {
      fm <- Y ~ S
    }
    fit1 <- tps(fm, data=subset(data2, Z==0 & !is.na(Y)), nn0=nPControls, nn1=nPCases, group=rep(1, NROW(subset(data2, Z==0 & !is.na(Y)))), method="PL", cohort=TRUE)
  }

  # kernel density estimator for f(s1|Sb=s0, X=x) using the treatment group in the phase 2 subset
  fhat <- npcdens(fbw)

  # kernel density estimator for g(s0|X=x) using the placebo group in the phase 2 subset
  if (anyBaselineCovar){
    ghat <- npcdens(gbw)
  } else {
    ghat <- npudens(gbw)
  }

  # a grid of values of S on which the estimated risk curves are returned
  if (is.null(psGrid)){
    psGrid <- seq(min(data2$S), max(data2$S), length.out=200)
  }

  # the first argument of 'risk' is a scalar
  if (anyBaselineCovar){
    fm <- as.formula(paste0("~",paste(formulaDecomp[[2]][-1],collapse="+")))
  } else {
    fm <- ~ 1
  }
  curves <- lapply(psGrid, function(s){ risk(s, fm, data, data2, pstype, bsmtype, fit1, fhat, ghat, cpoint) })
  plaRiskCurve <- sapply(curves, "[[", "plaRisk")
  txRiskCurve <- sapply(curves, "[[", "txRisk")

  # the output list
  oList <- list(psGrid=psGrid, plaRiskCurve=plaRiskCurve, txRiskCurve=txRiskCurve, fOptBandwidths=fbw, gOptBandwidths=gbw)
  if (hinge){
    oList$cpointP <- cpointP
    oList$cpointT <- cpointT
  }
  class(oList) <- "riskCurve"

  if (!is.null(saveFile) & !is.null(saveDir)){
    save(oList, file=file.path(saveDir, saveFile))
    cat("Output saved in:\n",file.path(saveDir, saveFile),"\n")
  }

  return(invisible(oList))
}

# 'object' is the output list from either 'riskCurve' or 'bootRiskCurve'
contrastRiskCurve <- function(object, contrast){
  if (contrast=="te"){ return(1 - object$txRisk/object$plaRisk) }
  if (contrast=="rr"){ return(object$txRisk/object$plaRisk) }
  if (contrast=="logrr"){ return(log(object$txRisk/object$plaRisk)) }
  if (contrast=="rd"){ return(object$plaRisk - object$txRisk) }
}

# 'object' is the output list from either 'riskCurve' or 'bootRiskCurve'
tContrastRiskCurve <- function(object, contrast){
  if (contrast %in% c("te","rr","logrr")){ return(log(object$txRisk/object$plaRisk)) }
  if (contrast=="rd"){ return(object$plaRisk - object$txRisk) }
}

# 'x' is a numeric vector
invtContrastRiskCurve <- function(x, contrast){
  if (contrast=="te"){ return(1 - exp(x)) }
  if (contrast=="rr"){ return(exp(x)) }
  if (contrast=="logrr"){ return(x) }
  if (contrast=="rd"){ return(x) }
}

#' Summary of Point and Interval Estimation of a Marginal Causal Effect Predictiveness Curve
#'
#' Summarizes point estimates and pointwise and simultaneous Wald-type bootstrap confidence intervals for a specified marginal causal effect predictiveness (mCEP) curve (see,
#' e.g., Juraska, Huang, and Gilbert (2018) for the definition).
#'
#' @param object an object of class \code{riskCurve}, typically returned by \code{\link{riskCurve}}
#' @param boot an object returned by \code{\link{bootRiskCurve}}. If \code{NULL} (default), only point estimates are reported.
#' @param contrast a character string specifying the mCEP curve. It must be one of \code{te} (treatment efficacy), \code{rr} (relative risk), \code{logrr} (log relative risk), and \code{rd} (risk
#' difference [placebo minus treatment]).
#' @param confLevel the confidence level of pointwise and simultaneous confidence intervals
#' @param \dots for other methods
#'
#' @return A data frame containing point and possibly interval estimates of the specified mCEP curve.
#'
#' @references Juraska, M., Huang, Y., and Gilbert, P. B. (2020), Inference on treatment effect modification by biomarker response in a three-phase sampling design, Biostatistics, 21(3): 545-560, \url{https://doi.org/10.1093/biostatistics/kxy074}.
#'
#' @examples
#' n <- 500
#' Z <- rep(0:1, each=n/2)
#' S <- MASS::mvrnorm(n, mu=c(2,2,3), Sigma=matrix(c(1,0.9,0.7,0.9,1,0.7,0.7,0.7,1), nrow=3))
#' p <- pnorm(drop(cbind(1,Z,(1-Z)*S[,2],Z*S[,3]) %*% c(-1.2,0.2,-0.02,-0.2)))
#' Y <- sapply(p, function(risk){ rbinom(1,1,risk) })
#' # delete S(1) in placebo recipients
#' S[Z==0,3] <- NA
#' # delete S(0) in treatment recipients
#' S[Z==1,2] <- NA
#' # generate the indicator of being sampled into the phase 2 subset
#' phase2 <- rbinom(n,1,0.4)
#' # delete Sb, S(0) and S(1) in controls not included in the phase 2 subset
#' S[Y==0 & phase2==0,] <- c(NA,NA,NA)
#' # delete Sb in cases not included in the phase 2 subset
#' S[Y==1 & phase2==0,1] <- NA
#' data <- data.frame(Z,S[,1],ifelse(Z==0,S[,2],S[,3]),Y)
#' colnames(data) <- c("Z","Sb","S","Y")
#' qS <- quantile(data$S, probs=c(0.05,0.95), na.rm=TRUE)
#' grid <- seq(qS[1], qS[2], length.out=2)
#'
#' out <- riskCurve(formula=Y ~ S, bsm="Sb", tx="Z", data=data, psGrid=grid)
#' boot <- bootRiskCurve(formula=Y ~ S, bsm="Sb", tx="Z", data=data,
#'                       psGrid=grid, iter=2, seed=10)
#' summary(out, boot, contrast="te")
#'
#' @seealso \code{\link{riskCurve}} and \code{\link{bootRiskCurve}}
#' @export
summary.riskCurve <- function(object, boot=NULL, contrast=c("te", "rr", "logrr", "rd"), confLevel=0.95,...){
  contrast <- match.arg(contrast)

  # point estimates of mCEP(s1)
  MCEP <- contrastRiskCurve(object, contrast)
  out <- data.frame(object$psGrid, MCEP)
  colnames(out) <- c("psGrid", contrast)

  # interval estimates of mCEP(s1)
  if(!is.null(boot)){
    # transformed MCEP curve
    tMCEP <- tContrastRiskCurve(object, contrast)
    # transformed bootstrapped MCEP curves
    # assuming the matrices have the same dimensions
    tbMCEP <- tContrastRiskCurve(boot, contrast)

    # bootstrap SE of tMCEP estimates
    bSE <- apply(tbMCEP, 1, sd, na.rm=TRUE)

    # pointwise confidence bounds for MCEP(s1)
    ptLB.MCEP <- invtContrastRiskCurve(tMCEP - qnorm(1-(1-confLevel)/2) * bSE, contrast=contrast)
    ptUB.MCEP <- invtContrastRiskCurve(tMCEP + qnorm(1-(1-confLevel)/2) * bSE, contrast=contrast)

    supAbsZ <- NULL
    for (j in 1:NCOL(tbMCEP)){
      Zstat <- abs((tbMCEP[,j]-tMCEP)/bSE)
      supAbsZ <- c(supAbsZ, max(Zstat, na.rm=!all(is.na(Zstat))))
    }
    qSupAbsZ <- quantile(supAbsZ, probs=confLevel, na.rm=TRUE)

    smLB.MCEP <- invtContrastRiskCurve(tMCEP - qSupAbsZ * bSE, contrast=contrast)
    smUB.MCEP <- invtContrastRiskCurve(tMCEP + qSupAbsZ * bSE, contrast=contrast)

    if (contrast=="te"){
      tmp <- ptUB.MCEP
      ptUB.MCEP <- ptLB.MCEP
      ptLB.MCEP <- tmp

      tmp <- smUB.MCEP
      smUB.MCEP <- smLB.MCEP
      smLB.MCEP <- tmp
    }

    out$ptLB <- ptLB.MCEP
    out$ptUB <- ptUB.MCEP
    out$smLB <- smLB.MCEP
    out$smUB <- smUB.MCEP
  }

  return(out)
}

#' Testing of the Null Hypotheses of a Flat and a Constant Marginal Causal Effect Predictiveness Curve
#'
#' Computes a two-sided p-value either from the test of \{\eqn{H_0^1: mCEP(s_1)=CE} for all \eqn{s_1}\}, where \eqn{CE} is the overall causal treatment effect on the clinical
#' endpoint, or from the test of \{\eqn{H_0^2: mCEP(s_1)=c} for all \eqn{s_1} in the interval \code{limS1} and a specified constant \eqn{c}\}, each against a general alternative
#' hypothesis. The testing procedures are described in Juraska, Huang, and Gilbert (2018) and are based on the simultaneous estimation method of Roy and Bose (1953).
#'
#' @param object an object returned by \code{\link{riskCurve}}
#' @param boot an object returned by \code{\link{bootRiskCurve}}
#' @param contrast a character string specifying the mCEP curve. It must be one of \code{te} (treatment efficacy), \code{rr} (relative risk), \code{logrr}
#' (log relative risk), and \code{rd} (risk difference [placebo minus treatment]).
#' @param null a character string specifying the null hypothesis to be tested; one of \code{H01} and \code{H02} as introduced above
#' @param overallPlaRisk a numeric value of the estimated overall clinical endpoint risk in the placebo group. It is required when \code{null} equals \code{H01}.
#' @param overallTxRisk a numeric value of the estimated overall clinical endpoint risk in the treatment group. It is required when \code{null} equals \code{H01}.
#' @param MCEPconstantH02 the constant \eqn{c} in the null hypothesis \eqn{H_0^2}. It is required when \code{null} equals \code{H02}.
#' @param limS1 a numeric vector of length 2 specifying an interval that is a subset of the support of \eqn{S(1)} and that is used in the evaluation of the null hypothesis
#' \eqn{H_0^2}. If \code{NULL} (default), then \eqn{H_0^2} is evaluated for all \eqn{s_1}.
#'
#' @return A numeric value representing the two-sided p-value from the test of either \eqn{H_0^1} or \eqn{H_0^2}.
#'
#' @references Juraska, M., Huang, Y., and Gilbert, P. B. (2020), Inference on treatment effect modification by biomarker response in a three-phase sampling design, Biostatistics, 21(3): 545-560, \url{https://doi.org/10.1093/biostatistics/kxy074}.
#'
#' Roy, S. N. and Bose, R. C. (1953), Simultaneous condence interval estimation, \emph{The Annals of Mathematical Statistics}, 24, 513-536.
#'
#' @examples
#' n <- 500
#' Z <- rep(0:1, each=n/2)
#' S <- MASS::mvrnorm(n, mu=c(2,2,3), Sigma=matrix(c(1,0.9,0.7,0.9,1,0.7,0.7,0.7,1), nrow=3))
#' p <- pnorm(drop(cbind(1,Z,(1-Z)*S[,2],Z*S[,3]) %*% c(-1.2,0.2,-0.02,-0.2)))
#' Y <- sapply(p, function(risk){ rbinom(1,1,risk) })
#' X <- rbinom(n,1,0.5)
#' # delete S(1) in placebo recipients
#' S[Z==0,3] <- NA
#' # delete S(0) in treatment recipients
#' S[Z==1,2] <- NA
#' # generate the indicator of being sampled into the phase 2 subset
#' phase2 <- rbinom(n,1,0.4)
#' # delete Sb, S(0) and S(1) in controls not included in the phase 2 subset
#' S[Y==0 & phase2==0,] <- c(NA,NA,NA)
#' # delete Sb in cases not included in the phase 2 subset
#' S[Y==1 & phase2==0,1] <- NA
#' data <- data.frame(X,Z,S[,1],ifelse(Z==0,S[,2],S[,3]),Y)
#' colnames(data) <- c("X","Z","Sb","S","Y")
#' qS <- quantile(data$S, probs=c(0.05,0.95), na.rm=TRUE)
#' grid <- seq(qS[1], qS[2], length.out=3)
#' \donttest{
#' out <- riskCurve(formula=Y ~ S + factor(X), bsm="Sb", tx="Z", data=data, psGrid=grid)
#' boot <- bootRiskCurve(formula=Y ~ S + factor(X), bsm="Sb", tx="Z", data=data,
#'                       psGrid=grid, iter=2, seed=10)
#' fit <- glm(Y ~ Z, data=data, family=binomial)
#' prob <- predict(fit, newdata=data.frame(Z=0:1), type="response")
#'
#' testConstancy(out, boot, contrast="te", null="H01", overallPlaRisk=prob[1],
#'               overallTxRisk=prob[2])
#' testConstancy(out, boot, contrast="te", null="H02", MCEPconstantH02=0, limS1=c(qS[1],1.5))
#' }
#'
#' @seealso \code{\link{riskCurve}}, \code{\link{bootRiskCurve}} and \code{\link{testEquality}}
#' @export
testConstancy <- function(object, boot, contrast=c("te", "rr", "logrr", "rd"), null=c("H01", "H02"), overallPlaRisk=NULL, overallTxRisk=NULL, MCEPconstantH02=NULL, limS1=NULL){
  contrast <- match.arg(contrast)
  null <- match.arg(null)

  if (null=="H01"){
    if (is.null(overallPlaRisk) | is.null(overallTxRisk)){ stop("'overallPlaRisk' and 'overallTxRisk' must be specified for the test of H01.") }
  }

  if (null=="H02"){
    if (is.null(MCEPconstantH02)){ stop("'MCEPconstantH02' must be specified for the test of H02.") }

    # trim the risk curves if 'limS1' is specified
    if (!is.null(limS1)){
      object$plaRiskCurve <- object$plaRiskCurve[object$psGrid>=limS1[1] & object$psGrid<=limS1[2]]
      object$txRiskCurve <- object$txRiskCurve[object$psGrid>=limS1[1] & object$psGrid<=limS1[2]]
      boot$plaRiskCurveBoot <- boot$plaRiskCurveBoot[boot$psGrid>=limS1[1] & boot$psGrid<=limS1[2],,drop=FALSE]
      boot$txRiskCurveBoot <- boot$txRiskCurveBoot[boot$psGrid>=limS1[1] & boot$psGrid<=limS1[2],,drop=FALSE]
    }
  }

  # transformed estimated MCEP curve
  tMCEP <- tContrastRiskCurve(object, contrast)
  # transformed bootstrapped MCEP curves
  tbMCEP <- tContrastRiskCurve(boot, contrast)

  # bootstrap SE based on tbMCEP estimates
  bSE <- apply(tbMCEP, 1, sd, na.rm=TRUE)

  # calculate the supremum statistic for each bootstrap sample
  supAbsZ <- NULL
  for (j in 1:NCOL(tbMCEP)){
    Zstat <- abs((tbMCEP[,j]-tMCEP)/bSE)
    supAbsZ <- c(supAbsZ, max(Zstat, na.rm=!all(is.na(Zstat))))
  }

  if (null=="H01"){
    tCEest <- ifelse(contrast=="rd", overallPlaRisk - overallTxRisk, log(overallTxRisk/overallPlaRisk))
    testStat <- max(abs(tMCEP-tCEest)/bSE, na.rm=TRUE)
    return(mean(supAbsZ > testStat))
  }

  if (null=="H02"){
    tMCEPconstantH02 <- switch(contrast, te=log(1-MCEPconstantH02), rr=log(MCEPconstantH02), logrr=MCEPconstantH02, rd=MCEPconstantH02)
    testStat <- max(abs(tMCEP-tMCEPconstantH02)/bSE, na.rm=TRUE)
    return(mean(supAbsZ > testStat))
  }
}

#' Testing of the Null Hypothesis of Equal Marginal Causal Effect Predictiveness Curves for Two Biomarkers, Endpoints, or Baseline Covariate Subgroups
#'
#' Computes a two-sided p-value either from the test of \{\eqn{H_0^3: mCEP_1(s_1)=mCEP_2(s_1)} for all \eqn{s_1} in \code{limS1}\}, where \eqn{mCEP_1} and \eqn{mCEP_2} are
#' each associated with either a different biomarker (measured in the same units) or a different endpoint or both, or from the test of \{\eqn{H_0^4: mCEP(s_1|X=0)=
#' mCEP(s_1|X=1)} for all \eqn{s_1} in \code{limS1}\}, where \eqn{X} is a baseline dichotomous phase 1 covariate of interest, each against a general alternative
#' hypothesis. The testing procedures are described in Juraska, Huang, and Gilbert (2018) and are based on the simultaneous estimation method of Roy and Bose (1953).
#'
#' @param object1 an object returned by \code{\link{riskCurve}} pertaining to either \eqn{mCEP_1(s_1)} in \eqn{H_0^3} or \eqn{mCEP(s1|X=0)} in \eqn{H_0^4}
#' @param object2 an object returned by \code{\link{riskCurve}} pertaining to either \eqn{mCEP_2(s_1)} in \eqn{H_0^3} or \eqn{mCEP(s1|X=1)} in \eqn{H_0^4}
#' @param boot1 an object returned by \code{\link{bootRiskCurve}} pertaining to either \eqn{mCEP_1(s_1)} in \eqn{H_0^3} or \eqn{mCEP(s1|X=0)} in \eqn{H_0^4}
#' @param boot2 an object returned by \code{\link{bootRiskCurve}} pertaining to either \eqn{mCEP_2(s_1)} in \eqn{H_0^3} or \eqn{mCEP(s1|X=1)} in \eqn{H_0^4}
#' @param contrast a character string specifying the mCEP curve. It must be one of \code{te} (treatment efficacy), \code{rr} (relative risk), \code{logrr}
#' (log relative risk), and \code{rd} (risk difference [placebo minus treatment]).
#' @param null a character string specifying the null hypothesis to be tested; one of \code{H03} and \code{H04} as introduced above
#' @param limS1 a numeric vector of length 2 specifying an interval that is a subset of the support of \eqn{S(1)}. If \code{NULL} (default), then the specified null
#' hypothesis is evaluated for all \eqn{s_1}.
#'
#' @return A numeric value representing the two-sided p-value from the test of either \eqn{H_0^3} or \eqn{H_0^4}.
#'
#' @references Juraska, M., Huang, Y., and Gilbert, P. B. (2020), Inference on treatment effect modification by biomarker response in a three-phase sampling design, Biostatistics, 21(3): 545-560, \url{https://doi.org/10.1093/biostatistics/kxy074}.
#'
#' Roy, S. N. and Bose, R. C. (1953), Simultaneous condence interval estimation, \emph{The Annals of Mathematical Statistics}, 24, 513-536.
#'
#' @examples
#' n <- 500
#' Z <- rep(0:1, each=n/2)
#' S <- MASS::mvrnorm(n, mu=c(2,2,3), Sigma=matrix(c(1,0.9,0.7,0.9,1,0.7,0.7,0.7,1), nrow=3))
#' p <- pnorm(drop(cbind(1,Z,(1-Z)*S[,2],Z*S[,3]) %*% c(-1.2,0.2,-0.02,-0.2)))
#' Y <- sapply(p, function(risk){ rbinom(1,1,risk) })
#' X <- rbinom(n,1,0.5)
#' # delete S(1) in placebo recipients
#' S[Z==0,3] <- NA
#' # delete S(0) in treatment recipients
#' S[Z==1,2] <- NA
#' # generate the indicator of being sampled into the phase 2 subset
#' phase2 <- rbinom(n,1,0.4)
#' # delete Sb, S(0) and S(1) in controls not included in the phase 2 subset
#' S[Y==0 & phase2==0,] <- c(NA,NA,NA)
#' # delete Sb in cases not included in the phase 2 subset
#' S[Y==1 & phase2==0,1] <- NA
#' data <- data.frame(X,Z,S[,1],ifelse(Z==0,S[,2],S[,3]),Y)
#' colnames(data) <- c("X","Z","Sb","S","Y")
#' qS <- quantile(data$S, probs=c(0.05,0.95), na.rm=TRUE)
#' grid <- seq(qS[1], qS[2], length.out=3)
#' out0 <- riskCurve(formula=Y ~ S, bsm="Sb", tx="Z", data=data[data$X==0,], psGrid=grid)
#' out1 <- riskCurve(formula=Y ~ S, bsm="Sb", tx="Z", data=data[data$X==1,], psGrid=grid)
#' boot0 <- bootRiskCurve(formula=Y ~ S, bsm="Sb", tx="Z", data=data[data$X==0,],
#'                        psGrid=grid, iter=2, seed=10)
#' boot1 <- bootRiskCurve(formula=Y ~ S, bsm="Sb", tx="Z", data=data[data$X==1,],
#'                        psGrid=grid, iter=2, seed=15)
#'
#' testEquality(out0, out1, boot0, boot1, contrast="te", null="H04")
#'
#' @seealso \code{\link{riskCurve}}, \code{\link{bootRiskCurve}} and \code{\link{testConstancy}}
#' @export
testEquality <- function(object1, object2, boot1, boot2, contrast=c("te", "rr", "logrr", "rd"), null=c("H03", "H04"), limS1=NULL){
  contrast <- match.arg(contrast)
  null <- match.arg(null)

  if (!setequal(object1$psGrid, object2$psGrid)){ stop("The estimated risk curves in 'object1' and 'object2' must be evaluated on the same grid of biomarker values.") }
  if (!setequal(boot1$psGrid, boot2$psGrid)){ stop("The bootstrapped risk curves in 'boot1' and 'boot2' must be evaluated on the same grid of biomarker values.") }

  # trim the risk curves if 'limS1' is specified
  if (!is.null(limS1)){
    object1$plaRiskCurve <- object1$plaRiskCurve[object1$psGrid>=limS1[1] & object1$psGrid<=limS1[2]]
    object1$txRiskCurve <- object1$txRiskCurve[object1$psGrid>=limS1[1] & object1$psGrid<=limS1[2]]
    object2$plaRiskCurve <- object2$plaRiskCurve[object2$psGrid>=limS1[1] & object2$psGrid<=limS1[2]]
    object2$txRiskCurve <- object2$txRiskCurve[object2$psGrid>=limS1[1] & object2$psGrid<=limS1[2]]

    boot1$plaRiskCurveBoot <- boot1$plaRiskCurveBoot[boot1$psGrid>=limS1[1] & boot1$psGrid<=limS1[2],,drop=FALSE]
    boot1$txRiskCurveBoot <- boot1$txRiskCurveBoot[boot1$psGrid>=limS1[1] & boot1$psGrid<=limS1[2],,drop=FALSE]
    boot2$plaRiskCurveBoot <- boot2$plaRiskCurveBoot[boot2$psGrid>=limS1[1] & boot2$psGrid<=limS1[2],,drop=FALSE]
    boot2$txRiskCurveBoot <- boot2$txRiskCurveBoot[boot2$psGrid>=limS1[1] & boot2$psGrid<=limS1[2],,drop=FALSE]
  }

  # transformed estimated mCEP curves
  tMCEP1 <- tContrastRiskCurve(object1, contrast)
  tMCEP2 <- tContrastRiskCurve(object2, contrast)
  # transformed bootstrapped mCEP curves
  tbMCEP1 <- tContrastRiskCurve(boot1, contrast)
  tbMCEP2 <- tContrastRiskCurve(boot2, contrast)

  # difference in the transformed mCEP curves
  difftMCEP <- tMCEP1 - tMCEP2
  difftbMCEP <- tbMCEP1 - tbMCEP2

  # bootstrap SE based on tbMCEP estimates
  if (null=="H03"){
    bSE <- apply(difftbMCEP, 1, sd, na.rm=TRUE)
  } else {
    # take advantage of the independence
    bSE <- sqrt(apply(tbMCEP1, 1, var, na.rm=TRUE) + apply(tbMCEP2, 1, var, na.rm=TRUE))
  }

  # calculate the supremum statistic for each bootstrap sample
  supAbsZ <- NULL
  for (j in 1:NCOL(difftbMCEP)){
    Z <- abs((difftbMCEP[,j]-difftMCEP)/bSE)
    supAbsZ <- c(supAbsZ, max(Z, na.rm=!all(is.na(Z))))
  }

  testStat <- max(abs(difftMCEP)/bSE, na.rm=TRUE)
  return(mean(supAbsZ > testStat))
}

#' Plotting of the Estimated Marginal Causal Effect Predictiveness Curve
#'
#' Plots point estimates and, if available, pointwise and simultaneous Wald-type bootstrap confidence intervals for the specified marginal causal effect predictiveness (mCEP)
#' curve.
#'
#' @param object an object returned by \code{\link{summary.riskCurve}}
#' @param confLevel the confidence level (0.95 by default) of pointwise and simultaneous confidence intervals
#' @param hingePoint the hinge point estimate (\code{NULL} by default)
#' @param title a character string specifying the plot title
#' @param xLab a character string specifying the x-axis label (\code{NULL} by default)
#' @param yLab a character string specifying the y-axis label (\code{NULL} by default)
#' @param yLim a numeric vector of length 2 specifying the y-axis range (\code{NULL} by default)
#' @param pType a character string specifying the type of plot. Possible options are \code{"l"} for lines (default) and \code{"p"} for points.
#'
#' @return None. The function is called solely for plot generation.
#'
#' @examples
#' n <- 500
#' Z <- rep(0:1, each=n/2)
#' S <- MASS::mvrnorm(n, mu=c(2,2,3), Sigma=matrix(c(1,0.9,0.7,0.9,1,0.7,0.7,0.7,1), nrow=3))
#' p <- pnorm(drop(cbind(1,Z,(1-Z)*S[,2],Z*S[,3]) %*% c(-1.2,0.2,-0.02,-0.2)))
#' Y <- sapply(p, function(risk){ rbinom(1,1,risk) })
#' X <- rbinom(n,1,0.5)
#' # delete S(1) in placebo recipients
#' S[Z==0,3] <- NA
#' # delete S(0) in treatment recipients
#' S[Z==1,2] <- NA
#' # generate the indicator of being sampled into the phase 2 subset
#' phase2 <- rbinom(n,1,0.3)
#' # delete Sb, S(0) and S(1) in controls not included in the phase 2 subset
#' S[Y==0 & phase2==0,] <- c(NA,NA,NA)
#' # delete Sb in cases not included in the phase 2 subset
#' S[Y==1 & phase2==0,1] <- NA
#' data <- data.frame(X,Z,S[,1],ifelse(Z==0,S[,2],S[,3]),Y)
#' colnames(data) <- c("X","Z","Sb","S","Y")
#' qS <- quantile(data$S, probs=c(0.05,0.95), na.rm=TRUE)
#' grid <- seq(qS[1], qS[2], length.out=3)
#' \donttest{
#' out <- riskCurve(formula=Y ~ S + factor(X), bsm="Sb", tx="Z", data=data, psGrid=grid)
#' boot <- bootRiskCurve(formula=Y ~ S + factor(X), bsm="Sb", tx="Z", data=data,
#'                       psGrid=grid, iter=2, seed=10)
#' sout <- summary(out, boot, contrast="te")
#' plotMCEPcurve(sout)
#' }
#'
#' @seealso \code{\link{riskCurve}}, \code{\link{bootRiskCurve}} and \code{\link{summary.riskCurve}}
#' @export
plotMCEPcurve <- function(object, confLevel=0.95, hingePoint=NULL, title=NULL, xLab=NULL, yLab=NULL, yLim=NULL, pType=c("l","p")){
  pType <- match.arg(pType)
  
  cexTitle <- 1.6
  cexLab <- 1.4
  cexAxis <- 1.3
  cexLegend <- 1.2

  if (is.null(yLim)){ 
    yLim <- range(object[,-1], na.rm=TRUE) 
    yLim <- c(yLim[1]-0.1*(yLim[2]-yLim[1]), yLim[2])
  }
  if (is.null(xLab)){ xLab <- expression(paste("Biomarker Response at ",t[0])) }
  if (is.null(yLab)){ yLab <- switch(colnames(object)[2], te="Treatment Efficacy", rr="Relative Risk", logrr="Log Relative Risk", rd="Risk Difference (Pla - Tx)") }

  par(mar=c(5,5,2,1), cex.lab=cexLab, cex.axis=cexAxis, las=1)

  plot(object[,1], object[,2], type="n", ylim=yLim, xlab=xLab, ylab=yLab)
  if (!is.null(title)){ mtext(title, side=3, line=0.5, cex=cexTitle) }
  abline(h=ifelse(colnames(object)[2]=="rr", 1, 0), lty="dotted", lwd=2, col="gray50")

  if (pType=="l"){
    lines(object[,1], object[,2], lwd=3.5)
    
    # if interval estimates are available in the data frame
    if (NCOL(object) > 2){
      lines(object[,1], object$ptLB, lty="dashed", lwd=3)
      lines(object[,1], object$ptUB, lty="dashed", lwd=3)
      lines(object[,1], object$smLB, lty="dotdash", lwd=3)
      lines(object[,1], object$smUB, lty="dotdash", lwd=3)
      
      legend("bottomleft", lty=c("dashed","dotdash"), lwd=3, legend=c(paste0("Pointwise ",confLevel*100,"% CI"), paste0("Simultaneous ",confLevel*100,"% CI")),
             cex=cexLegend, bty="n")
    }
  } else {
    points(object[,1], object[,2], lwd=3.5, pch=16, cex=1.6)
    
    # if interval estimates are available in the data frame
    if (NCOL(object) > 2){
      points(object[,1], object$ptLB, lty="dashed", lwd=3, pch=1)
      points(object[,1], object$ptUB, lty="dashed", lwd=3, pch=1)
      points(object[,1], object$smLB, lty="dotdash", lwd=3, pch=4)
      points(object[,1], object$smUB, lty="dotdash", lwd=3, pch=4)
      
      legend("bottomleft", pch=c(1,4), legend=c(paste0("Pointwise ",confLevel*100,"% CI"), paste0("Simultaneous ",confLevel*100,"% CI")),
             cex=cexLegend, bty="n")
    }
  }
  
  if (!is.null(hingePoint)){ legend("bottomright", paste0("Hinge Point = ", round(hingePoint,2),"  "), cex=cexLegend, bty="n") }
}

npcdensbw.formula <- function (formula, data, subset, na.action, call, ...){
  orig.class <- if (missing(data))
    sapply(eval(attr(terms(formula), "variables"), environment(formula)),
           class)
  else sapply(eval(attr(terms(formula), "variables"), data,
                   environment(formula)), class)
  mf <- match.call(expand.dots = FALSE)
  m <- match(c("formula", "data", "subset", "na.action"), names(mf),
             nomatch = 0)
  mf <- mf[c(1, m)]
  if (!missing(call) && is.call(call)) {
    for (i in 1:length(call)) {
      if (tryCatch(class(eval(call[[i]])) == "formula",
                   error = function(e) FALSE))
        break
    }
    mf[[2]] <- call[[i]]
  }
  mf[[1]] <- as.name("model.frame")
  # patch
  #if (m[2] > 0) { # use data as environment
  #  mf[["formula"]] = eval(mf[[m[1]]], environment(mf[[m[2]]]))
  #} else { # use parent frame
  mf[["formula"]] = eval(mf[[m[1]]], parent.frame())
  #}
  # end of patch
  variableNames <- explodeFormula(mf[["formula"]])
  varsPlus <- lapply(variableNames, paste, collapse = " + ")
  mf[["formula"]] <- as.formula(paste(" ~ ", varsPlus[[1]],
                                      " + ", varsPlus[[2]]), env = environment(formula))
  mf[["formula"]] <- terms(mf[["formula"]])
  if (all(orig.class == "ts")) {
    args <- (as.list(attr(mf[["formula"]], "variables"))[-1])
    attr(mf[["formula"]], "predvars") <- as.call(c(quote(as.data.frame),
                                                   as.call(c(quote(ts.intersect), args))))
  }
  else if (any(orig.class == "ts")) {
    arguments <- (as.list(attr(mf[["formula"]], "variables"))[-1])
    arguments.normal <- arguments[which(orig.class != "ts")]
    arguments.timeseries <- arguments[which(orig.class ==
                                              "ts")]
    ix <- sort(c(which(orig.class == "ts"), which(orig.class !=
                                                    "ts")), index.return = TRUE)$ix
    attr(mf[["formula"]], "predvars") <- bquote(.(as.call(c(quote(cbind),
                                                            as.call(c(quote(as.data.frame), as.call(c(quote(ts.intersect),
                                                                                                      arguments.timeseries)))), arguments.normal, check.rows = TRUE)))[,
                                                                                                                                                                       .(ix)])
  }
  mf <- eval(mf, parent.frame())
  ydat <- mf[, variableNames[[1]], drop = FALSE]
  xdat <- mf[, variableNames[[2]], drop = FALSE]
  tbw = npcdensbw(xdat = xdat, ydat = ydat, ...)
  tbw$call <- match.call(expand.dots = FALSE)
  environment(tbw$call) <- parent.frame()
  tbw$formula <- formula
  tbw$rows.omit <- as.vector(attr(mf, "na.action"))
  tbw$nobs.omit <- length(tbw$rows.omit)
  tbw$terms <- attr(mf, "terms")
  tbw$variableNames <- variableNames
  tbw
}

explodeFormula <- function (formula){
  res <- strsplit(strsplit(paste(deparse(formula), collapse = ""),
                           " *[~] *")[[1]], " *[+] *")
  stopifnot(all(sapply(res, length) > 0))
  names(res) <- c("response", "terms")
  res
}

Try the pssmooth package in your browser

Any scripts or data that you put into this service are public.

pssmooth documentation built on Jan. 13, 2021, 5:56 a.m.