#' General Template for Collaborative Targeted Maximum Likelihood Estimation
#'
#' This function computes the Collaborative Targeted Maximum Likelihood Estimator.
#'
#'@import glmnet
#'@import SuperLearner
#'@import tmle
#'@import stats
#' @export
#' @param Y continuous or binary outcome variable
#' @param A binary treatment indicator, 1 for treatment, 0 for control
#' @param W vector, matrix, or dataframe containing baseline covariates for Q bar
#' @param Wg vector, matrix, or dataframe containing baseline covariates for propensity score model
#' (defaults to W if not supplied by user)
#' @param Q n by 2 matrix of initial values for Q0W, Q1W in columns 1 and 2, respectively.
#' Current version does not support SL for automatic initial estimation of Q bar
#' @param ctmletype 1 or 3. Type of general C-TMLE. Type 1 uses cross-validation to select best gn,
#' while Type 3 directly solves extra clever covariates.
#' @param gn_candidates matrix or dataframe, each column stand for a estimate of propensity score.
#' Estimate in the column with larger index should have smaller empirical loss
#' @param gn_candidates_cv matrix or dataframe, each column stand for a the cross-validated estimate.
#' For example, the (i,j)-th element is the predicted propensity score by j-th estimator,
#' for i-th observation, when it is in the validation set
#' @param folds The list of indices for cross-validation step. We recommend the cv-splits in C-TMLE matchs that in gn_candidate_cv
#' @param V Number of folds. Only used if folds is not specified
#' @param alpha used to keep predicted initial values bounded away from (0,1) for logistic fluctuation,
#' 0.995 (default)
#' @param family family specification for working regression models, generally 'gaussian' for continuous
#' outcomes (default), 'binomial' for binary outcomes
#' @param like_type 'RSS' or 'loglike'. The metric to use for forward selection and cross-validation
#' @param gbound bound on P(A=1|W), defaults to 0.025
#' @param fluctuation 'logistic' (default) or 'linear', for targeting step
#' @param verbose print status messages if TRUE
#' @param PEN boolean. If true, penalized loss is used in cross-validation step
#' @param g1W Only used when type is 3. a user-supplied propensity score estimate.
#' @param g1WPrev Only used when type is 3. a user-supplied propensity score estimate,
#' with small fluctuation compared to g1W.
#' @param stopFactor Numerical value with default 1e6.
#' If the current empirical likelihood is stopFactor times larger than the best previous one,
#' the construction would stop
#' @param detailed boolean number. If it is TRUE, return more detailed results
#' @return best_k the index of estimate that selected by cross-validation
#' @return est estimate of psi_0
#' @return CI IC-based 95% confidence interval for parameter estimate
#' @return pvalue pvalue for the null hypothesis that Psi = 0
#' @return likelihood sum of squared residuals, based on selected estimator evaluated on all obs or, logistic loglikelihood if like_type != "RSS"
#' @return varIC empirical variance of the influence curve adjusted for estimation of g
#' @return varDstar empirical variance of the influence curve
#' @return var.psi variance of the estimate
#' @return varIC.cv cross-validated variance of the influence curve
#' @return penlikelihood.cv penalized cross-validated likelihood
#' @return cv.res all cross-validation results for each fold
#' @examples
#'N <- 1000
#'p = 100
#'V = 5
#'Wmat <- matrix(rnorm(N * p), ncol = p)
#'gcoef <- matrix(c(-1,-1,rep(-(3/((p)-2)),(p)-2)),ncol=1)
#'
#'W <- as.data.frame(Wmat)
#'g <- 1/(1+exp(Wmat%*%gcoef / 3))
#'A <- rbinom(N, 1, prob = g)
#'
#'# Build potential outcome pairs, and the observed outcome Y
#'beta1 <- 4+2*Wmat[,1]+2*Wmat[,2]+2*Wmat[,5]+2*Wmat[,6]+2*Wmat[,8]
#'beta0 <- 2+2*Wmat[,1]+2*Wmat[,2]+2*Wmat[,5]+2*Wmat[,6]+2*Wmat[,8]
#'
#'tau = 2
#'sigma <- 1
#'epsilon <-rnorm(N,0,sigma)
#'Y <- beta0 + tau * A + epsilon
#'# Initial estimate of Q
#'Q <- cbind(rep(mean(Y[A == 1]), N), rep(mean(Y[A == 0]), N))
#'
#'folds <-by(sample(1:N,N), rep(1:V, length=N), list)
#'
#'lasso_fit <- cv.glmnet(x = as.matrix(W), y = A, alpha = 1, nlambda = 100, nfolds = 10)
#'lasso_lambdas <- lasso_fit$lambda[lasso_fit$lambda <= lasso_fit$lambda.min][1:5]
#'# Build template for glmnet
#'SL.glmnet_new <- function (Y, X, newX, family, obsWeights, id, alpha = 1,
#' nlambda = 100, lambda = 0,...)
#'{
#' # browser()
#' if (!is.matrix(X)) {
#' X <- model.matrix(~-1 + ., X)
#' newX <- model.matrix(~-1 + ., newX)
#' }
#' fit <- glmnet::glmnet(x = X, y = Y,
#' lambda = lambda,
#' family = family$family, alpha = alpha)
#' pred <- predict(fit, newx = newX, type = "response")
#' fit <- list(object = fit)
#' class(fit) <- "SL.glmnet"
#' out <- list(pred = pred, fit = fit)
#' return(out)
#'}
#'
#'# Use a sequence of LASSO estimators to build gn sequence:
#'SL.cv1lasso <- function (... , alpha = 1, lambda = lasso_lambdas[1]){
#' SL.glmnet_new(... , alpha = alpha, lambda = lambda)
#'}
#'
#'SL.cv2lasso <- function (... , alpha = 1, lambda = lasso_lambdas[2]){
#' SL.glmnet_new(... , alpha = alpha, lambda = lambda)
#'}
#'
#'SL.cv3lasso <- function (... , alpha = 1, lambda = lasso_lambdas[3]){
#' SL.glmnet_new(... , alpha = alpha, lambda = lambda)
#'}
#'
#'SL.cv4lasso <- function (... , alpha = 1, lambda = lasso_lambdas[4]){
#' SL.glmnet_new(... , alpha = alpha, lambda = lambda)
#'}
#'
#'SL.library = c('SL.cv1lasso', 'SL.cv2lasso', 'SL.cv3lasso', 'SL.cv4lasso', 'SL.glm')
#'
#'# Build the sequence. See more details in the function build_gn_seq, and package SuperLearner
#'gn_seq <- build_gn_seq(A = A, W = W, SL.library = SL.library, folds = folds)
#'
#'
#'# Use the output of build_gn_seq for ctmle general templates
#'ctmle_fit <- ctmleGeneral(Y = Y, A = A, W = W, Q = Q, ctmletype = 1,
#' gn_candidates = gn_seq$gn_candidates,
#' gn_candidates_cv = gn_seq$gn_candidates_cv,
#' folds = gn_seq$folds, V = length(folds))
ctmleGeneral <- function(Y, A, W, Wg = W, Q,
ctmletype,
gn_candidates,
gn_candidates_cv = NULL,
alpha=.995, family= "gaussian",
gbound=.025, like_type = "RSS",
fluctuation="logistic", verbose=FALSE,
detailed=FALSE, PEN=FALSE,
g1W = NULL, g1WPrev = NULL,
V=5, folds = NULL,
stopFactor=10^6) {
if(ctmletype != 3){
if(is.null(gn_candidates_cv)){
gn_candidates_cv <- gn_candidates
warning('All gn candidates are only estimated on whole data.
Cross-validation might not be objective enough!')
}else if(is.null(folds)){
warning('It is more objective to provide folds, corresponding to gn_candidate_cv.')
}
}
n <- length(Y)
maptoYstar <- fluctuation=="logistic"
Qbounds <- range(Y)
QAW <- (1 - A) * Q[, 1] + A * Q[, 2]
Q <- cbind(QAW, Q0W = Q[, 1], Q1W = Q[, 2])
Q <- bound(Q, Qbounds)
Ystar <- bound(Y, Qbounds)
ab <- range(Ystar, na.rm = TRUE)
Ystar[is.na(Ystar)] <- 0
Ystar <- (Ystar - ab[1])/diff(ab)
Q <- (Q - ab[1])/diff(ab)
Qbounds <- c(alpha, 1 - alpha)
Q <- bound(Q, Qbounds)
Q <- qlogis(Q)
family.stage2 <- "binomial"
Qinit<-list(Q=Q, Y=Ystar, Qbounds=Qbounds, ab=ab, family=family.stage2)
if(ctmletype == 3){
#------------------------------------------------------------------------
#-----------------------ctmleGeneral type 3-------------------------------
#------------------------------------------------------------------------
if(is.null(g1W) | is.null(g1WPrev)){
stop("For cglmeGlmnet type 3 needs inute g1W and g1WPrev")
}
g <- g1W
Y=Qinit$Y
X=data.frame(A,Wg)
Q=Qinit$Q
family=Qinit$family
Qbounds=Qinit$Qbounds
ab=Qinit$ab
gbound=gbound
g1W.total1 <- bound(g1WPrev, c(gbound, 1-gbound))
g0W.total1 <- 1 - bound(g1WPrev, c(gbound, 1-gbound))
g1W.total <- bound(g, c(gbound, 1-gbound))
g0W.total <- 1 - bound(g, c(gbound, 1-gbound))
H1W <- X[,1]/g1W.total
H0W <- (1 - X[,1])/g0W.total
ddg1<-(g1W.total-g1W.total1)
ddg1[which(ddg1==0)]<-1e-10
ddg0<-(g0W.total-g0W.total1)
ddg0[which(ddg0==0)]<-1e-10
ddH1W <- (X[,1]/(g1W.total^2)) * ddg1
ddH0W <- ((1 - X[,1])/(g0W.total^2)) * ddg0
suppressWarnings(epsilon<- coef(glm(Y ~ -1 +
offset(Q[, "QAW"])+
H0W+H1W+
ddH0W + ddH1W, family = family)))
Qstar <- Q + cbind((epsilon[1] * H0W+ epsilon[2] * H1W),
epsilon[1]/g0W.total, epsilon[2]/g1W.total)+
cbind((epsilon[3] * ddH0W+ epsilon[4] * ddH1W),
(epsilon[3]/(g0W.total^2))*ddg0, (epsilon[4]/(g1W.total^2))*ddg1)
if(family[1]=="binomial"){
Qstar <- plogis(Qstar)
}
mu1 <- mean(Qstar[,"Q1W"])
mu0 <- mean(Qstar[,"Q0W"])
est <- (mean(Qstar[,"Q1W"]) - mean(Qstar[,"Q0W"]))*diff(ab)
Dstar <- (A/g1W.total - (1 - A)/(1-g1W.total)) * (Y - Qstar[, "QAW"]) +
Qstar[, "Q1W"] - Qstar[, "Q0W"] - (mean(Qstar[,"Q1W"]) - mean(Qstar[,"Q0W"]))
var.psi <- var(Dstar)/length(Y) * (diff(ab)^2)
CI <- est + c(-1.96,1.96) * sqrt(var.psi)
# !!! Next step: add more details
emptyInfo <- 'Not Applicable'
returnVal <- (list(candidates=emptyInfo,
best_k=emptyInfo,
est= est,
CI = CI,
Qinit=Qinit,
ab = Qinit$ab,
# Next step: provide likelihood.
likelihood= emptyInfo,
varIC = emptyInfo,
varDstar = emptyInfo,
varIC.cv = emptyInfo,
var.psi = var.psi,
penlikelihood.cv = emptyInfo,
cv.res = emptyInfo)
)
returnVal$pvalue <- 2 * pnorm(-abs(returnVal$est)/sqrt(returnVal$var.psi))
# !!!
class(returnVal) <- "ctmle"
return(returnVal)
}else{
# Construct by gn estimated with whole data
# !!Notice here gns are gn_candidates
candidates <- stage2_general(Y=Qinit$Y, X=data.frame(A,Wg), Q=Qinit$Q,
gn_candidates = gn_candidates,
ctmletype=ctmletype,
family=Qinit$family,
Qbounds=Qinit$Qbounds,ab=Qinit$ab,
# Use whole data as training
training_set=rep(TRUE,length(Y)),
like_type=like_type,gbound=gbound, verbose=verbose,
stopFactor=stopFactor)
if(verbose) {cat("\t\t-----Cross-validation to select estimator-----\n")}
# Select best k by CV
# !!Notice here gns are gn_candidates_cv
cv.res <- cv_general(Y=Qinit$Y,X=data.frame(A,Wg),
est.all=candidates$results.all$est,
Q=Qinit$Q,
gn_candidates = gn_candidates_cv,
ctmletype=ctmletype,
family=Qinit$family,
Qbounds=Qinit$Qbounds,
ab=Qinit$ab,
like_type=like_type,gbound=gbound,
verbose=verbose, PEN=PEN , V=V,
folds = folds)
Q.init <- Qinit$Q
if(family == "binomial" | maptoYstar){Q.init <- plogis(Q.init)}
psi.init <- mean(Q.init[,"Q1W"] - Q.init[,"Q0W"]) * diff(Qinit$ab)
if(verbose) {
cat("\tEstimate based on initial, untargeted Q:", round(psi.init,4),"\n")
cat(paste("\tCross validation complete.\n"))
}
candidates$results.all$varIC_origscale <-
candidates$results.all$varIC * ifelse(maptoYstar, (diff(Qinit$ab))^2, 1)
candidates$results.all$varDstar_origscale <-
candidates$results.all$varDstar * ifelse(maptoYstar, (diff(Qinit$ab))^2, 1)
# browser()
returnVal <- (list(candidates=candidates,
best_k=cv.res$best_k,
est= candidates$results.all$est[cv.res$best_k],
CI = candidates$results.all$est[cv.res$best_k] +
c(-1.96,1.96) * sqrt(candidates$results.all$varIC_origscale[cv.res$best_k]/n),
Qinit=Qinit,
ab = Qinit$ab,
likelihood= candidates$results.all$likelihood[cv.res$best_k],
varIC = candidates$results.all$varIC_origscale[cv.res$best_k],
varDstar = candidates$results.all$varDstar_origscale[cv.res$best_k],
varIC.cv=cv.res$varIC[cv.res$best_k],
var.psi=candidates$results.all$varIC_origscale[cv.res$best_k]/n,
penlikelihood.cv = cv.res$penlikelihood[cv.res$best_k],
cv.res=cv.res))
returnVal$pvalue <- 2*pnorm(-abs(returnVal$est)/sqrt(returnVal$var.psi))
just_enough <- c(2:4,6,8,9,11,14)
if (!detailed) {
returnVal <- returnVal[just_enough]
}
}
class(returnVal) <- "ctmle"
return(returnVal)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.