#' ROSE random forest estimator for the partially linear model
#'
#' Estimates the parameter of interest \eqn{\theta_0} in the partially linear model
#' \deqn{\mathbb{E}[Y|X,Z] = X\theta_0 + f_0(Z),} which can be reposed in terms of
#' the `nuisance functions' \eqn{(\mathbb{E}[Y|Z], \mathbb{E}[X|Z])} as \deqn{\mathbb{E}[Y|X,Z]-\mathbb{E}[Y|Z] = (X-\mathbb{E}[X|Z])\theta_0.}
#'
#' The estimator of interest \eqn{\theta_0} solves the estimating equation
#' \deqn{\sum_{i}\psi(Y_i,X_i,Z_i; \theta,\hat{\eta}(Z),\hat{w}(Z)) = 0,}
#' \deqn{\psi(Y,X,Z;\theta,\eta_0,w) := \sum_{j=1}^J w_j(Z) \big( M_j(X) - \mathbb{E}[M_j(X)|Z]\big) \Big( \big(Y-\mathbb{E}[Y|Z]\big)-\big(X-\mathbb{E}[X|Z]\big)\theta \Big),}
#' \deqn{\eta_0 := \big(\mathbb{E}[Y|Z=\cdot], \mathbb{E}[X|Z=\cdot]\big),}
#' where \eqn{M_1(X),\ldots,M_J(X)} denotes user-chosen functions of \eqn{X}
#' and \eqn{w(Z)=\big(w_1(Z),\ldots,w_J(Z)\big)} denotes weights estimated via ROSE random forests.
#' The default takes \eqn{J=1} and \eqn{M_1(X)=X}; if taking \eqn{J\geq 2} we recommend care
#' in checking the applicability and appropriateness of any additional user-chosen
#' regression tasks.
#'
#' The parameter of interest \eqn{\theta_0} is estimated using a DML2 / \eqn{K}-fold cross-fitting
#' framework, allowing arbitrary (faster than \eqn{n^{1/4}}-consistent) learners for \eqn{\hat{\eta}}, i.e. solving
#' the estimating equation
#' \deqn{\sum_{k \in [K]}\sum_{i \in I_k}\psi(Y_i,X_i,Z_i; \theta,\hat{\eta}^{(k)}(Z),\hat{w}^{(k)}(Z)) = 0,}
#' where \eqn{I_1,\ldots,I_K} denotes a partition of the index set for the datapoints \eqn{(Y_i,X_i,Z_i)},
#' \eqn{\hat{\eta}^{(k)}} denotes an estimator for \eqn{\eta_0} trained on the data indexed by
#' \eqn{I_k^c}, and \eqn{\hat{w}^{(k)}} denotes a ROSE random forest (again trained on the data
#' indexed by \eqn{I_k^c}).
#'
#' @param y_formula a two-sided formula object describing the model for \eqn{\mathbb{E}[Y|Z]}.
#' @param y_learner a string specifying the regression method to fit the regression of \eqn{Y} on \eqn{Z} as given by \code{y_formula} (e.g. \code{randomforest, xgboost, neuralnet, gam}).
#' @param y_pars a list containing hyperparameters for the \code{y_learner} chosen. Default is an empty list, which performs hyperparameter tuning.
#' @param x_formula a two-sided formula object describing the model for \eqn{\mathbb{E}[X|Z]}.
#' @param x_learner a string specifying the regression method to fit the regression of \eqn{X} on \eqn{Z} as given by \code{x_formula} (e.g. \code{randomforest, xgboost, neuralnet, gam}).
#' @param x_pars a list containing hyperparameters for the \code{x_learner} chosen. Default is an empty list, which performs hyperparameter tuning.
#' @param M1_formula a two-sided formula object for the model \eqn{\mathbb{E}[M_1(X)|Z]}. Default is \eqn{M_1(X)=X}.
#' @param M1_learner a string specifying the regression method for \eqn{\mathbb{E}[M_1(X)|Z]} estimation.
#' @param M1_pars a list containing hyperparameters for the \code{M1_learner} chosen.
#' @param M2_formula a two-sided formula object for the model \eqn{\mathbb{E}[M_2(X)|Z]}. Default is no formula / regression (i.e. \eqn{J=1}).
#' @param M2_learner a string specifying the regression method for \eqn{\mathbb{E}[M_2(X)|Z]} estimation.
#' @param M2_pars a list containing hyperparameters for the \code{M2_learner} chosen.
#' @param M3_formula a two-sided formula object for the model \eqn{\mathbb{E}[M_3(X)|Z]}. Default is no formula / regression (i.e. \eqn{J=1}).
#' @param M3_learner a string specifying the regression method for \eqn{\mathbb{E}[M_3(X)|Z]} estimation.
#' @param M3_pars a list containing hyperparameters for the \code{M3_learner} chosen.
#' @param M4_formula a two-sided formula object for the model \eqn{\mathbb{E}[M_4(X)|Z]}. Default is no formula / regression (i.e. \eqn{J=1}).
#' @param M4_learner a string specifying the regression method for \eqn{\mathbb{E}[M_4(X)|Z]} estimation.
#' @param M4_pars a list containing hyperparameters for the \code{M4_learner} chosen.
#' @param M5_formula a two-sided formula object for the model \eqn{\mathbb{E}[M_5(X)|Z]}. Default is no formula / regression (i.e. \eqn{J=1}).
#' @param M5_learner a string specifying the regression method for \eqn{\mathbb{E}[M_5(X)|Z]} estimation.
#' @param M5_pars a list containing hyperparameters for the \code{M5_learner} chosen.
#' @param data a data frame containing the variables for the partially linear model.
#' @param K the number of folds used for \eqn{K}-fold cross-fitting. Default is 5.
#' @param S the number of repeats used to mitigate the dependence of the estimator on the sample splits used for \eqn{K}-fold cross-fitting. Default is 1.
#' @param max.depth Maximum depth parameter used for ROSE random forests. Default is 10.
#' @param num.trees Number of trees used for a single ROSE random forest. Default is 500.
#' @param min.node.size Minimum node size of a leaf in each tree. Default is \code{max(10, ceiling(0.01*(K-1)/K*nrow(data)))}.
#' @param replace Whether sampling for each random tree is performed with (bootstrap) or without replacement. Default is \code{TRUE} (i.e. bootstrap).
#' @param sample.fraction Proportion of data used for each random tree. Default is 0.8.
#'
#' @return A list containing:
#' \describe{
#' \item{\code{theta}}{The estimator of \eqn{\theta_0}.}
#' \item{\code{stderror}}{Huber robust estimate of the standard error of the \eqn{\theta_0}-estimator.}
#' \item{\code{coefficients}}{Table of \eqn{\theta_0} coefficient estimator, standard error, z-value and p-value.}
#' }
#'
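#' @examples
#' # A minimal usage sketch (not run): assumes simulated data with outcome Y,
#' # covariate X and confounders Z1, Z2; the learner string "randomforest"
#' # follows the options listed above.
#' \dontrun{
#' set.seed(1)
#' n <- 500
#' Z1 <- rnorm(n); Z2 <- rnorm(n)
#' X <- Z1 + rnorm(n)
#' Y <- 0.5 * X + sin(Z2) + rnorm(n)
#' df <- data.frame(Y = Y, X = X, Z1 = Z1, Z2 = Z2)
#' fit <- roseRF_plm(y_formula = Y ~ Z1 + Z2, y_learner = "randomforest",
#'                   x_formula = X ~ Z1 + Z2, x_learner = "randomforest",
#'                   data = df, K = 5, S = 1)
#' fit$coefficients
#' }
#'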
#' @importFrom caret createFolds
#' @importFrom stats as.formula median pnorm
#' @export
roseRF_plm <- function(y_formula, y_learner, y_pars = list(),
                       x_formula, x_learner, x_pars = list(),
                       M1_formula = x_formula, M1_learner = x_learner, M1_pars = x_pars,
                       M2_formula = NA, M2_learner = NA, M2_pars = list(),
                       M3_formula = NA, M3_learner = NA, M3_pars = list(),
                       M4_formula = NA, M4_learner = NA, M4_pars = list(),
                       M5_formula = NA, M5_learner = NA, M5_pars = list(),
                       data, K = 5, S = 1,
                       max.depth = 10, num.trees = 500,
                       min.node.size = max(10, ceiling(0.01 * (K - 1) / K * nrow(data))),
                       replace = TRUE, sample.fraction = 0.8) {
  # Bookkeeping: names of the covariates Z appearing in the nuisance formulas
  Z <- unique(c(all.vars(y_formula)[-1], all.vars(x_formula)[-1]))
  theta_hat_S <- V_hat_S <- numeric(S)
  for (s in seq_len(S)) {
    cv_folds <- createFolds(data[, 1], K)
    res_y_on_z_theta_k <- res_x_on_z_theta_k <- list()
    w_fres_ik <- list()
    for (k in seq_len(K)) {
      cv_fold <- cv_folds[[k]]
      data_nuisance <- data[-cv_fold, ]
      data_theta <- data[cv_fold, ]
      # Regress Y on Z and X on Z (trained on the nuisance folds), retaining
      # residuals on both the nuisance folds and the held-out fold
      fit_y_reg <- regress_nuisance(y_formula, y_learner, data_nuisance, data_theta, y_pars)
      res_y_on_z_nuisance <- fit_y_reg$residuals_nuisance
      res_y_on_z_theta <- fit_y_reg$residuals_theta
      res_y_on_z_theta_k[[k]] <- res_y_on_z_theta
      fit_x_reg <- regress_nuisance(x_formula, x_learner, data_nuisance, data_theta, x_pars)
      res_x_on_z_nuisance <- fit_x_reg$residuals_nuisance
      res_x_on_z_theta <- fit_x_reg$residuals_theta
      res_x_on_z_theta_k[[k]] <- res_x_on_z_theta
      # Initial pilot estimate of theta on the nuisance folds (OLS of the
      # Y-residuals on the X-residuals)
      theta_k <- sum(res_y_on_z_nuisance * res_x_on_z_nuisance) / sum(res_x_on_z_nuisance^2)
      # Generate ROSE random forest weights for each j = 1:J
      d_theta_eps_nuisance <- -res_x_on_z_nuisance
      eps_nuisance <- res_y_on_z_nuisance - theta_k * res_x_on_z_nuisance
      eps_sq_nuisance <- eps_nuisance^2
      M_formulae <- list(M1_formula, M2_formula, M3_formula, M4_formula, M5_formula)
      M_learners <- list(M1_learner, M2_learner, M3_learner, M4_learner, M5_learner)
      M_parses <- list(M1_pars, M2_pars, M3_pars, M4_pars, M5_pars)
      # J = number of M_j regressions actually supplied (NA entries are skipped)
      J <- sum(vapply(M_formulae, function(x) inherits(x, "formula"), logical(1)))
      d_theta_psi_nuisance_j <- psi_sq_nuisance_j <- psi_nuisance_j <- res_f_on_z_theta_j <- list()
      for (j in seq_len(J)) {
        # Reuse the X-on-Z regression when the jth M-regression coincides with
        # it; otherwise fit the jth M-regression afresh
        f_formula <- M_formulae[[j]]
        f_learner <- M_learners[[j]]
        f_pars <- M_parses[[j]]
        if (identical(f_formula, x_formula) && identical(f_learner, x_learner)) {
          res_fj_on_z_nuisance <- res_x_on_z_nuisance
          res_fj_on_z_theta <- res_x_on_z_theta
        } else {
          fit_fj_reg <- regress_nuisance(f_formula, f_learner, data_nuisance, data_theta, f_pars)
          res_fj_on_z_nuisance <- fit_fj_reg$residuals_nuisance
          res_fj_on_z_theta <- fit_fj_reg$residuals_theta
        }
        d_theta_psi_nuisance_j[[j]] <- res_fj_on_z_nuisance * d_theta_eps_nuisance
        psi_sq_nuisance_j[[j]] <- res_fj_on_z_nuisance^2 * eps_sq_nuisance
        psi_nuisance_j[[j]] <- res_fj_on_z_nuisance * eps_nuisance
        res_f_on_z_theta_j[[j]] <- res_fj_on_z_theta
      }
      # Assemble per-observation score components as columns for the ROSE forest
      forest_data <- data_nuisance
      for (j in seq_len(J)) {
        forest_data[[paste0("d_theta_psi_", j)]] <- d_theta_psi_nuisance_j[[j]]
        forest_data[[paste0("psi_sq_", j)]] <- psi_sq_nuisance_j[[j]]
        forest_data[[paste0("psi_", j)]] <- psi_nuisance_j[[j]]
      }
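      # Fit the ROSE random forest weight function(s) on the nuisance folds and
      # evaluate the weighted M-residuals on the held-out fold; rose_forest_J1 /
      # rose_forest_Jgtr1 are internal helpers (defined elsewhere in the package)
      # for the single- and multiple-regression cases respectively.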
      if (J == 1) {
        w_theta_j_fit <- rose_forest_J1(data_train = forest_data, data_theta = data_theta,
                                        J = J, Z = Z, max.depth = max.depth, num.trees = num.trees,
                                        min.node.size = min.node.size, replace = replace,
                                        sample.fraction = sample.fraction)
        w_fres_ik[[k]] <- w_theta_j_fit * res_f_on_z_theta_j[[1]]
      } else {
        w_theta_j_fit <- rose_forest_Jgtr1(data_train = forest_data, data_theta = data_theta,
                                           J = J, Z = Z, max.depth = max.depth, num.trees = num.trees,
                                           min.node.size = min.node.size, replace = replace,
                                           sample.fraction = sample.fraction)
        # Sum w_j(Z_i) * (M_j-residual)_i over j = 1:J for each observation i
        w_fres_ik[[k]] <- Reduce(`+`, Map(`*`, w_theta_j_fit, res_f_on_z_theta_j))
      }
    }
    # The score is linear in theta, so the pooled estimating equation over all
    # folds solves in closed form
    d_theta_psi <- -unlist(w_fres_ik) * unlist(res_x_on_z_theta_k)
    sum_d_theta_psi <- sum(d_theta_psi)
    psi_b <- -unlist(w_fres_ik) * unlist(res_y_on_z_theta_k)
    sum_psi_b <- sum(psi_b)
    theta_hat <- sum_psi_b / sum_d_theta_psi
    # Sandwich variance estimate from the estimated scores
    psi <- -psi_b + theta_hat * d_theta_psi
    psi_sq <- psi^2
    V_hat <- nrow(data) * sum(psi_sq) / sum_d_theta_psi^2
    theta_hat_S[s] <- theta_hat
    V_hat_S[s] <- V_hat
  }
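  # Median-based aggregation over the S sample splits: each split's variance is
  # inflated by the squared deviation of that split's estimate from the median
  # estimate (DML-style aggregation across repeated cross-fitting splits).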
  # Aggregate over S repetitions
  se <- sqrt(median(V_hat_S / nrow(data) + (theta_hat_S - median(theta_hat_S))^2))
  thet <- median(theta_hat_S)
  zval <- thet / se
  tab <- cbind(Estimate = thet,
               StdErr = se,
               z.value = zval,
               p.value = 2 * pnorm(-abs(zval)))
  rownames(tab) <- all.vars(x_formula)[1]
  colnames(tab) <- c("Estimate", "Std. Err", "z value", "Pr(>|z|)")
  res <- structure(list(theta = thet, stderror = se, coefficients = tab,
                        model = list(model.type = "plm")), class = "roseforest")
  return(res)
}
#' ROSE random forest estimator for the generalised partially linear model
#'
#' Estimates the parameter of interest \eqn{\theta_0} in the generalised partially linear model
#' \deqn{g(\mathbb{E}[Y|X,Z]) = X\theta_0 + f_0(Z),} for some (strictly increasing, differentiable) link function \eqn{g}, which can be reposed in terms of
#' the `nuisance functions' \eqn{(\mathbb{E}[X|Z], \mathbb{E}[g(\mathbb{E}[Y|X,Z])|Z])} as \deqn{g(\mathbb{E}[Y|X,Z]) - \mathbb{E}[g(\mathbb{E}[Y|X,Z])|Z] = (X-\mathbb{E}[X|Z])\theta_0.}
#'
#' The estimator of interest \eqn{\theta_0} solves the estimating equation
#' \deqn{\sum_{i}\psi(Y_i,X_i,Z_i; \theta,\hat{\eta}(Z),\hat{w}(Z)) = 0,}
#' \deqn{\psi(Y,X,Z;\theta,\eta_0,w) := \sum_{j=1}^J w_j(Z) \big( M_j(X) - \mathbb{E}[M_j(X)|Z] \big) g'(\mu(X,Z;\theta,\eta_0)) \big(Y-\mu(X,Z;\theta,\eta_0)\big) ,}
#' \deqn{\mu(X,Z;\theta,\eta_0) := g^{-1}\big(\mathbb{E}[g(\mathbb{E}[Y|X,Z])|Z] + (X-\mathbb{E}[X|Z])\theta\big),}
#' \deqn{\eta_0 := \big(\mathbb{E}[g(\mathbb{E}[Y|X,Z])|Z=\cdot], \mathbb{E}[X|Z=\cdot]\big),}
#' where \eqn{M_1(X),\ldots,M_J(X)} denotes user-chosen functions of \eqn{X}
#' and \eqn{w(Z)=\big(w_1(Z),\ldots,w_J(Z)\big)} denotes weights estimated via ROSE random forests.
#' The default takes \eqn{J=1} and \eqn{M_1(X)=X}; if taking \eqn{J\geq 2} we recommend care
#' in checking the applicability and appropriateness of any additional user-chosen
#' regression tasks.
#'
#' The parameter of interest \eqn{\theta_0} is estimated using a DML2 / \eqn{K}-fold cross-fitting
#' framework, allowing arbitrary (faster than \eqn{n^{1/4}}-consistent) learners for \eqn{\hat{\eta}}, i.e. solving
#' the estimating equation
#' \deqn{\sum_{k \in [K]}\sum_{i \in I_k}\psi(Y_i,X_i,Z_i; \theta,\hat{\eta}^{(k)}(Z),\hat{w}^{(k)}(Z)) = 0,}
#' where \eqn{I_1,\ldots,I_K} denotes a partition of the index set for the datapoints \eqn{(Y_i,X_i,Z_i)},
#' \eqn{\hat{\eta}^{(k)}} denotes an estimator for \eqn{\eta_0} trained on the data indexed by
#' \eqn{I_k^c}, and \eqn{\hat{w}^{(k)}} denotes a ROSE random forest (again trained on the data
#' indexed by \eqn{I_k^c}).
#'
#' @param y_on_xz_formula a two-sided formula object describing the model for \eqn{\mathbb{E}[Y|X,Z]} (regressing \eqn{Y} on \eqn{(X,Z)}).
#' @param y_on_xz_learner a string specifying the regression method to fit the regression as given by \code{y_on_xz_formula} (e.g. \code{randomforest, xgboost, neuralnet, gam}).
#' @param y_on_xz_pars a list containing hyperparameters for the \code{y_on_xz_learner} chosen. Default is an empty list, which performs hyperparameter tuning.
#' @param Gy_on_z_formula a two-sided formula object describing the model for \eqn{\mathbb{E}[g(\mathbb{E}[Y|X,Z])|Z]} (regressing \eqn{g(\hat{\mathbb{E}}[Y|X,Z])} on \eqn{Z}).
#' @param Gy_on_z_learner a string specifying the regression method to fit the regression as given by \code{Gy_on_z_formula} (e.g. \code{randomforest, xgboost, neuralnet, gam}).
#' @param Gy_on_z_pars a list containing hyperparameters for the \code{Gy_on_z_learner} chosen. Default is an empty list, which performs hyperparameter tuning.
#' @param x_formula a two-sided formula object describing the model for \eqn{\mathbb{E}[X|Z]}.
#' @param x_learner a string specifying the regression method to fit the regression of \eqn{X} on \eqn{Z} as given by \code{x_formula} (e.g. \code{randomforest, xgboost, neuralnet, gam}).
#' @param x_pars a list containing hyperparameters for the \code{x_learner} chosen. Default is an empty list, which performs hyperparameter tuning.
#' @param M1_formula a two-sided formula object for the model \eqn{\mathbb{E}[M_1(X)|Z]}. Default is \eqn{M_1(X)=X}.
#' @param M1_learner a string specifying the regression method for \eqn{\mathbb{E}[M_1(X)|Z]} estimation.
#' @param M1_pars a list containing hyperparameters for the \code{M1_learner} chosen.
#' @param M2_formula a two-sided formula object for the model \eqn{\mathbb{E}[M_2(X)|Z]}. Default is no formula / regression (i.e. \eqn{J=1}).
#' @param M2_learner a string specifying the regression method for \eqn{\mathbb{E}[M_2(X)|Z]} estimation.
#' @param M2_pars a list containing hyperparameters for the \code{M2_learner} chosen.
#' @param M3_formula a two-sided formula object for the model \eqn{\mathbb{E}[M_3(X)|Z]}. Default is no formula / regression (i.e. \eqn{J=1}).
#' @param M3_learner a string specifying the regression method for \eqn{\mathbb{E}[M_3(X)|Z]} estimation.
#' @param M3_pars a list containing hyperparameters for the \code{M3_learner} chosen.
#' @param M4_formula a two-sided formula object for the model \eqn{\mathbb{E}[M_4(X)|Z]}. Default is no formula / regression (i.e. \eqn{J=1}).
#' @param M4_learner a string specifying the regression method for \eqn{\mathbb{E}[M_4(X)|Z]} estimation.
#' @param M4_pars a list containing hyperparameters for the \code{M4_learner} chosen.
#' @param M5_formula a two-sided formula object for the model \eqn{\mathbb{E}[M_5(X)|Z]}. Default is no formula / regression (i.e. \eqn{J=1}).
#' @param M5_learner a string specifying the regression method for \eqn{\mathbb{E}[M_5(X)|Z]} estimation.
#' @param M5_pars a list containing hyperparameters for the \code{M5_learner} chosen.
#' @param link link function (\eqn{g}). Options include \code{identity}, \code{log}, \code{sqrt}, \code{logit}, \code{probit}. Default is \code{identity}.
#' @param data a data frame containing the variables for the generalised partially linear model.
#' @param K the number of folds used for \eqn{K}-fold cross-fitting. Default is 5.
#' @param S the number of repeats used to mitigate the dependence of the estimator on the sample splits used for \eqn{K}-fold cross-fitting. Default is 1.
#' @param max.depth Maximum depth parameter used for ROSE random forests. Default is 10.
#' @param num.trees Number of trees used for a single ROSE random forest. Default is 500.
#' @param min.node.size Minimum node size of a leaf in each tree. Default is \code{max(10, ceiling(0.01*(K-1)/K*nrow(data)))}.
#' @param replace Whether sampling for each random tree is performed with (bootstrap) or without replacement. Default is \code{TRUE} (i.e. bootstrap).
#' @param sample.fraction Proportion of data used for each random tree. Default is 0.8.
#'
#' @return A list containing:
#' \describe{
#' \item{\code{theta}}{The estimator of \eqn{\theta_0}.}
#' \item{\code{stderror}}{Huber robust estimate of the standard error of the \eqn{\theta_0}-estimator.}
#' \item{\code{coefficients}}{Table of \eqn{\theta_0} coefficient estimator, standard error, z-value and p-value.}
#' }
#'
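#' @examples
#' # A minimal usage sketch (not run): assumes simulated binary-outcome data with
#' # covariate X and confounders Z1, Z2; learner strings follow the options above.
#' \dontrun{
#' set.seed(1)
#' n <- 500
#' Z1 <- rnorm(n); Z2 <- rnorm(n)
#' X <- Z1 + rnorm(n)
#' Y <- rbinom(n, 1, plogis(0.5 * X + sin(Z2)))
#' df <- data.frame(Y = Y, X = X, Z1 = Z1, Z2 = Z2)
#' fit <- roseRF_gplm(y_on_xz_formula = Y ~ X + Z1 + Z2, y_on_xz_learner = "randomforest",
#'                    Gy_on_z_formula = Y ~ Z1 + Z2, Gy_on_z_learner = "randomforest",
#'                    x_formula = X ~ Z1 + Z2, x_learner = "randomforest",
#'                    link = "logit", data = df, K = 5, S = 1)
#' fit$coefficients
#' }
#'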
#' @importFrom caret createFolds
#' @importFrom stats uniroot as.formula median pnorm
#' @export
roseRF_gplm <- function(y_on_xz_formula, y_on_xz_learner, y_on_xz_pars = list(),
                        Gy_on_z_formula, Gy_on_z_learner, Gy_on_z_pars = list(),
                        x_formula, x_learner, x_pars = list(),
                        M1_formula = x_formula, M1_learner = x_learner, M1_pars = x_pars,
                        M2_formula = NA, M2_learner = NA, M2_pars = list(),
                        M3_formula = NA, M3_learner = NA, M3_pars = list(),
                        M4_formula = NA, M4_learner = NA, M4_pars = list(),
                        M5_formula = NA, M5_learner = NA, M5_pars = list(),
                        link = "identity", data, K = 5, S = 1,
                        max.depth = 10, num.trees = 500,
                        min.node.size = max(10, ceiling(0.01 * (K - 1) / K * nrow(data))),
                        replace = TRUE, sample.fraction = 0.8) {
  # Bookkeeping: link object, response names, and covariate names Z
  G_link <- create_link(link)
  G_response <- all.vars(Gy_on_z_formula)[1]
  response <- all.vars(y_on_xz_formula)[1]
  Z <- unique(c(all.vars(Gy_on_z_formula)[-1], all.vars(x_formula)[-1]))
  theta_hat_S <- V_hat_S <- numeric(S)
  for (s in seq_len(S)) {
    cv_folds <- createFolds(data[, 1], K)
    y_theta_k <- fitted_Gy_on_z_theta_k <- res_x_on_z_theta_k <- list()
    w_fres_ik <- list()
    for (k in seq_len(K)) {
      cv_fold <- cv_folds[[k]]
      data_nuisance <- data[-cv_fold, ]
      data_theta <- data[cv_fold, ]
      # Step 1: regress Y on (X,Z), then form pseudo-responses g(E-hat[Y|X,Z])
      # (fitted values are first projected into the valid mean space)
      fit_y_on_xz <- regress_nuisance(y_on_xz_formula, y_on_xz_learner, data_nuisance, data_theta, y_on_xz_pars)
      G_pseudo_res <- G_link$linkfun(G_link$muspaceproj(fit_y_on_xz$fitted_nuisance))
      G_pseudo_df_nuisance <- data_nuisance
      if (G_response %in% colnames(G_pseudo_df_nuisance)) {
        G_pseudo_df_nuisance <- G_pseudo_df_nuisance[, !(colnames(G_pseudo_df_nuisance) %in% G_response)]
      }
      G_pseudo_df_nuisance <- cbind(G_vals = G_pseudo_res, G_pseudo_df_nuisance)
      colnames(G_pseudo_df_nuisance)[1] <- G_response
      # Step 2: regress the pseudo-responses on Z
      fit_Gy_reg <- regress_nuisance(Gy_on_z_formula, Gy_on_z_learner, G_pseudo_df_nuisance, data_theta, Gy_on_z_pars)
      fitted_Gy_on_z_nuisance <- fit_Gy_reg$fitted_nuisance
      fitted_Gy_on_z_theta <- fit_Gy_reg$fitted_theta
      fitted_Gy_on_z_theta_k[[k]] <- fitted_Gy_on_z_theta
      y_theta_k[[k]] <- data_theta[, response]
      # Step 3: regress X on Z
      fit_x_reg <- regress_nuisance(x_formula, x_learner, data_nuisance, data_theta, x_pars)
      res_x_on_z_nuisance <- fit_x_reg$residuals_nuisance
      res_x_on_z_theta <- fit_x_reg$residuals_theta
      res_x_on_z_theta_k[[k]] <- res_x_on_z_theta
      # Initial pilot estimate of theta on the nuisance folds: solve the
      # (unweighted) score equation by root-finding
      eval_theta_k <- function(theta) {
        mu <- G_link$linkinv(res_x_on_z_nuisance * theta + fitted_Gy_on_z_nuisance)
        eps <- G_link$linkderiv(mu) * (data_nuisance[, response] - mu)
        score <- sum(res_x_on_z_nuisance * eps)
        return(score)
      }
      if (k == 1) {
        # Expand the search interval until it brackets a root, then widen it
        # once more for safety (the same interval is reused for later folds)
        lower_bd <- -1; upper_bd <- 1
        while (eval_theta_k(lower_bd) * eval_theta_k(upper_bd) > 0) {
          lower_bd <- 10 * lower_bd
          upper_bd <- 10 * upper_bd
        }
        lower_bd <- 10 * lower_bd
        upper_bd <- 10 * upper_bd
      }
      theta_k <- uniroot(eval_theta_k, interval = c(lower_bd, upper_bd))$root
      mu_nuisance <- G_link$linkinv(res_x_on_z_nuisance * theta_k + fitted_Gy_on_z_nuisance)
      d_theta_eps_nuisance <- -res_x_on_z_nuisance
      eps_nuisance <- G_link$linkderiv(mu_nuisance) * (data_nuisance[, response] - mu_nuisance)
      eps_sq_nuisance <- eps_nuisance^2
      # Generate ROSE random forest weights for each j = 1:J
      M_formulae <- list(M1_formula, M2_formula, M3_formula, M4_formula, M5_formula)
      M_learners <- list(M1_learner, M2_learner, M3_learner, M4_learner, M5_learner)
      M_parses <- list(M1_pars, M2_pars, M3_pars, M4_pars, M5_pars)
      # J = number of M_j regressions actually supplied (NA entries are skipped)
      J <- sum(vapply(M_formulae, function(x) inherits(x, "formula"), logical(1)))
      d_theta_psi_nuisance_j <- psi_sq_nuisance_j <- psi_nuisance_j <- res_f_on_z_theta_j <- list()
      for (j in seq_len(J)) {
        # Reuse the X-on-Z regression when the jth M-regression coincides with
        # it; otherwise fit the jth M-regression afresh
        f_formula <- M_formulae[[j]]
        f_learner <- M_learners[[j]]
        f_pars <- M_parses[[j]]
        if (identical(f_formula, x_formula) && identical(f_learner, x_learner)) {
          res_fj_on_z_nuisance <- res_x_on_z_nuisance
          res_fj_on_z_theta <- res_x_on_z_theta
        } else {
          fit_fj_reg <- regress_nuisance(f_formula, f_learner, data_nuisance, data_theta, f_pars)
          res_fj_on_z_nuisance <- fit_fj_reg$residuals_nuisance
          res_fj_on_z_theta <- fit_fj_reg$residuals_theta
        }
        d_theta_psi_nuisance_j[[j]] <- res_fj_on_z_nuisance * d_theta_eps_nuisance
        psi_sq_nuisance_j[[j]] <- res_fj_on_z_nuisance^2 * eps_sq_nuisance
        psi_nuisance_j[[j]] <- res_fj_on_z_nuisance * eps_nuisance
        res_f_on_z_theta_j[[j]] <- res_fj_on_z_theta
      }
      # Assemble per-observation score components as columns for the ROSE forest
      forest_data <- data_nuisance
      for (j in seq_len(J)) {
        forest_data[[paste0("d_theta_psi_", j)]] <- d_theta_psi_nuisance_j[[j]]
        forest_data[[paste0("psi_sq_", j)]] <- psi_sq_nuisance_j[[j]]
        forest_data[[paste0("psi_", j)]] <- psi_nuisance_j[[j]]
      }
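      # Fit the ROSE random forest weight function(s) on the nuisance folds and
      # evaluate the weighted M-residuals on the held-out fold; rose_forest_J1 /
      # rose_forest_Jgtr1 are internal helpers (defined elsewhere in the package)
      # for the single- and multiple-regression cases respectively.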
      if (J == 1) {
        w_theta_j_fit <- rose_forest_J1(data_train = forest_data, data_theta = data_theta,
                                        J = J, Z = Z, max.depth = max.depth, num.trees = num.trees,
                                        min.node.size = min.node.size, replace = replace,
                                        sample.fraction = sample.fraction)
        w_fres_ik[[k]] <- w_theta_j_fit * res_f_on_z_theta_j[[1]]
      } else {
        w_theta_j_fit <- rose_forest_Jgtr1(data_train = forest_data, data_theta = data_theta,
                                           J = J, Z = Z, max.depth = max.depth, num.trees = num.trees,
                                           min.node.size = min.node.size, replace = replace,
                                           sample.fraction = sample.fraction)
        # Sum w_j(Z_i) * (M_j-residual)_i over j = 1:J for each observation i
        w_fres_ik[[k]] <- Reduce(`+`, Map(`*`, w_theta_j_fit, res_f_on_z_theta_j))
      }
    }
    # Solve the pooled (cross-fitted) weighted score equation for theta
    eval_theta_full <- function(theta) {
      mu <- G_link$linkinv(unlist(res_x_on_z_theta_k) * theta + unlist(fitted_Gy_on_z_theta_k))
      eps <- G_link$linkderiv(mu) * (unlist(y_theta_k) - mu)
      score <- sum(unlist(w_fres_ik) * eps)
      return(score)
    }
    theta_hat <- uniroot(eval_theta_full, interval = c(lower_bd, upper_bd))$root
    # Sandwich variance estimate from the estimated scores
    mu_theta <- G_link$linkinv(unlist(res_x_on_z_theta_k) * theta_hat + unlist(fitted_Gy_on_z_theta_k))
    eps <- G_link$linkderiv(mu_theta) * (unlist(y_theta_k) - mu_theta)
    psi <- unlist(w_fres_ik) * eps
    psi_sq <- psi^2
    d_theta_eps <- -unlist(res_x_on_z_theta_k)
    d_theta_psi <- unlist(w_fres_ik) * d_theta_eps
    V_hat <- nrow(data) * sum(psi_sq) / (sum(d_theta_psi))^2
    theta_hat_S[s] <- theta_hat
    V_hat_S[s] <- V_hat
  }
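  # Median-based aggregation over the S sample splits: each split's variance is
  # inflated by the squared deviation of that split's estimate from the median
  # estimate (DML-style aggregation across repeated cross-fitting splits).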
  # Aggregate over S repetitions
  se <- sqrt(median(V_hat_S / nrow(data) + (theta_hat_S - median(theta_hat_S))^2))
  thet <- median(theta_hat_S)
  zval <- thet / se
  tab <- cbind(Estimate = thet,
               StdErr = se,
               z.value = zval,
               p.value = 2 * pnorm(-abs(zval)))
  rownames(tab) <- all.vars(x_formula)[1]
  colnames(tab) <- c("Estimate", "Std. Err", "z value", "Pr(>|z|)")
  res <- structure(list(theta = thet, stderror = se, coefficients = tab,
                        model = list(model.type = "gplm", link = link)), class = "roseforest")
  return(res)
}