#' Sieve maximum likelihood estimator (SMLE) for two-phase linear regression problems
#'
#' Performs efficient semiparametric estimation for general two-phase measurement error models when there are errors in both the outcome and covariates.
#'
#' @param Y_unval Column name of the error-prone or unvalidated continuous outcome. Subjects with missing values of \code{Y_unval} are omitted from the analysis. This argument is required.
#' @param Y Column name that stores the validated value of \code{Y_unval} in the second phase. Subjects with missing values of \code{Y} are considered as those not selected in the second phase. This argument is required.
#' @param X_unval Specifies the columns of the error-prone covariates. Subjects with missing values of \code{X_unval} are omitted from the analysis. This argument is required.
#' @param X Specifies the columns that store the validated values of \code{X_unval} in the second phase. Subjects with missing values of \code{X} are considered as those not selected in the second phase. This argument is required.
#' @param Bspline Specifies the columns of the B-spline basis. Subjects with missing values of \code{Bspline} are omitted from the analysis. This argument is required.
#' @param Z Specifies the columns of the accurately measured covariates. Subjects with missing values of \code{Z} are omitted from the analysis. This argument is optional.
#' @param data Specifies the name of the dataset. This argument is required.
#' @param hn_scale Specifies the scale of the perturbation constant in the variance estimation. For example, if \code{hn_scale = 0.5}, then the perturbation constant is \eqn{0.5n^{-1/2}}, where \eqn{n} is the first-phase sample size. The default value is \code{1}. This argument is optional.
#' @param MAX_ITER Maximum number of iterations in the EM algorithm. The default number is \code{1000}. This argument is optional.
#' @param TOL Specifies the convergence criterion in the EM algorithm. The default value is \code{1E-4}. This argument is optional.
#' @param noSE If \code{TRUE}, then the variances of the parameter estimators will not be estimated. The default value is \code{FALSE}. This argument is optional.
#' @param verbose If \code{TRUE}, then show details of the analysis. The default value is \code{FALSE}.
#'
#' @return
#' \item{coefficients}{Stores the analysis results.}
#' \item{sigma}{Stores the residual standard error.}
#' \item{covariance}{Stores the covariance matrix of the regression coefficient estimates.}
#' \item{converge}{In parameter estimation, if the EM algorithm converges, then \code{converge = TRUE}. Otherwise, \code{converge = FALSE}.}
#' \item{converge_cov}{In variance estimation, if the EM algorithm converges, then \code{converge_cov = TRUE}. Otherwise, \code{converge_cov = FALSE}.}
#'
#' @importFrom Rcpp evalCpp
#' @importFrom stats pchisq
#'
#' @references
#' Tao, R., Mercaldo, N. D., Haneuse, S., Maronge, J. M., Rathouz, P. J., Heagerty, P. J., & Schildcrout, J. S. (2021). Two-wave two-phase outcome-dependent sampling designs, with applications to longitudinal binary data. *Statistics in Medicine, 40*(8), 1863–1876. https://doi.org/10.1002/sim.8876
#'
#' @seealso [cv_linear2ph()] to calculate the average predicted log likelihood of this function.
#'
#' @examples
#' rho = -.3
#' p = 0.3
#' hn_scale = 1
#' nsieve = 20
#'
#' n = 100
#' n2 = 40
#' alpha = 0.3
#' beta = 0.4
#' set.seed(12345)
#'
#' ### generate data
#' simX = rnorm(n)
#' epsilon = rnorm(n)
#' simY = alpha+beta*simX+epsilon
#' error = MASS::mvrnorm(n, mu=c(0,0), Sigma=matrix(c(1, rho, rho, 1), nrow=2))
#'
#' simS = rbinom(n, 1, p)
#' simU = simS*error[,2]
#' simW = simS*error[,1]
#' simY_tilde = simY+simW
#' simX_tilde = simX+simU
#'
#' id_phase2 = sample(n, n2)
#'
#' simY[-id_phase2] = NA
#' simX[-id_phase2] = NA
#'
#' # # histogram basis
#' # Bspline = matrix(NA, nrow=n, ncol=nsieve)
#' # cut_x_tilde = cut(simX_tilde, breaks=quantile(simX_tilde, probs=seq(0, 1, 1/nsieve)),
#' # include.lowest = TRUE)
#' # for (i in 1:nsieve) {
#' # Bspline[,i] = as.numeric(cut_x_tilde == names(table(cut_x_tilde))[i])
#' # }
#' # colnames(Bspline) = paste("bs", 1:nsieve, sep="")
#' # # histogram basis
#'
#' # # linear basis
#' # Bspline = splines::bs(simX_tilde, df=nsieve, degree=1,
#' # Boundary.knots=range(simX_tilde), intercept=TRUE)
#' # colnames(Bspline) = paste("bs", 1:nsieve, sep="")
#' # # linear basis
#'
#' # # quadratic basis
#' # Bspline = splines::bs(simX_tilde, df=nsieve, degree=2,
#' # Boundary.knots=range(simX_tilde), intercept=TRUE)
#' # colnames(Bspline) = paste("bs", 1:nsieve, sep="")
#' # # quadratic basis
#'
#' # cubic basis
#' Bspline = splines::bs(simX_tilde, df=nsieve, degree=3,
#' Boundary.knots=range(simX_tilde), intercept=TRUE)
#' colnames(Bspline) = paste("bs", 1:nsieve, sep="")
#' # cubic basis
#'
#' data = data.frame(Y_tilde=simY_tilde, X_tilde=simX_tilde, Y=simY, X=simX, Bspline)
#'
#' res = linear2ph(Y="Y", X="X", Y_unval="Y_tilde", X_unval="X_tilde",
#' Bspline=colnames(Bspline), data=data, hn_scale=0.1)
#' @export
linear2ph <- function (Y_unval=NULL, Y=NULL, X_unval=NULL, X=NULL, Z=NULL, Bspline=NULL, data=NULL, hn_scale=1, noSE=FALSE, TOL=1E-4, MAX_ITER=1000, verbose=FALSE)
{
### linear2ph
###############################################################################################################
#### check data ###############################################################################################
storage.mode(MAX_ITER) = "integer"
storage.mode(TOL) = "double"
storage.mode(noSE) = "integer"
if (missing(data)) {
stop("No dataset is provided!")
}
if (missing(Y_unval)) {
stop("The error-prone response Y_unval is not specified!")
} else {
vars_ph1 = Y_unval
}
if (missing(X_unval)) {
stop("The error-prone covariates X_unval is not specified!")
} else {
vars_ph1 = c(vars_ph1, X_unval)
}
if (missing(Bspline)) {
stop("The B-spline basis is not specified!")
} else {
vars_ph1 = c(vars_ph1, Bspline)
}
if (missing(Y)) {
stop("The accurately measured response Y is not specified!")
}
if (missing(X)) {
stop("The validated covariates in the second-phase are not specified!")
}
if (length(X_unval) != length(X)) {
stop("The number of columns in X_unval and X is different!")
}
if (!missing(Z)) {
vars_ph1 = c(vars_ph1, Z)
}
id_exclude = c()
for (var in vars_ph1) {
id_exclude = union(id_exclude, which(is.na(data[,var])))
}
if (verbose) {
message("There are ", nrow(data), " observations in the dataset.")
message(length(id_exclude), " observations are excluded due to missing Y_unval, X_unval, or Z.")
}
if (length(id_exclude) > 0) {
data = data[-id_exclude,]
}
n = nrow(data)
if (verbose) {
message("There are ", n, " observations in the analysis.")
}
id_phase1 = which(is.na(data[,Y]))
for (var in X) {
id_phase1 = union(id_phase1, which(is.na(data[,var])))
}
if (verbose) {
message("There are ", n-length(id_phase1), " observations validated in the second phase.")
}
#### check data ###############################################################################################
###############################################################################################################
###############################################################################################################
#### prepare analysis #########################################################################################
Y_unval_vec = c(as.vector(data[-id_phase1,Y_unval]), as.vector(data[id_phase1,Y_unval]))
storage.mode(Y_unval_vec) = "double"
X_tilde_mat = rbind(as.matrix(data[-id_phase1,X_unval]), as.matrix(data[id_phase1,X_unval]))
storage.mode(X_tilde_mat) = "double"
Bspline_mat = rbind(as.matrix(data[-id_phase1,Bspline]), as.matrix(data[id_phase1,Bspline]))
storage.mode(Bspline_mat) = "double"
Y_vec = as.vector(data[-id_phase1,Y])
storage.mode(Y_vec) = "double"
X_mat = as.matrix(data[-id_phase1,X])
storage.mode(X_mat) = "double"
if (!is.null(Z)) {
Z_mat = rbind(as.matrix(data[-id_phase1,Z]), as.matrix(data[id_phase1,Z]))
storage.mode(Z_mat) = "double"
}
cov_names = c("Intercept", X)
if (!is.null(Z)) {
cov_names = c(cov_names, Z)
}
ncov = length(cov_names)
X_nc = length(X)
rowmap = rep(NA, ncov)
res_coefficients = matrix(NA, nrow=ncov, ncol=4)
colnames(res_coefficients) = c("Estimate", "SE", "Statistic", "p-value")
rownames(res_coefficients) = cov_names
res_cov = matrix(NA, nrow=ncov, ncol=ncov)
colnames(res_cov) = cov_names
rownames(res_cov) = cov_names
if (is.null(Z)) {
Z_mat = as.matrix(rep(1., n))
rowmap[1] = ncov
rowmap[2:ncov] = 1:X_nc
} else {
Z_mat = cbind(1, Z_mat)
rowmap[1] = X_nc+1
rowmap[2:(X_nc+1)] = 1:X_nc
rowmap[(X_nc+2):ncov] = (X_nc+2):ncov
}
hn = hn_scale/sqrt(n)
#### prepare analysis #########################################################################################
###############################################################################################################
if (verbose)
{
message("Calling C++ function TwoPhase_MLE0_MEXY")
}
## Ensure every variable is the correct type
if (!is.vector(Y_unval_vec))
{
warning("Y_unval_vec is not a vector!")
}
if (!is.matrix(X_tilde_mat))
{
warning("X_tilde_mat is not a matrix!")
}
if (!is.vector(Y_vec))
{
warning("Y_vec is not a vector!")
}
if (!is.matrix(X_mat))
{
warning("X_mat is not a matrix!")
}
if (!is.matrix(Z_mat))
{
warning("Z_mat is not a matrix!")
}
if (!is.matrix(Bspline_mat))
{
warning("Bspline_mat is not a matrix!")
}
###############################################################################################################
#### analysis #################################################################################################
res = .TwoPhase_MLE0_MEXY(Y_unval_vec, X_tilde_mat, Y_vec, X_mat, Z_mat, Bspline_mat, hn, MAX_ITER, TOL, noSE)
#### analysis #################################################################################################
###############################################################################################################
###############################################################################################################
#### return results ###########################################################################################
res_coefficients[,1] = res$theta[rowmap]
res_coefficients[which(res_coefficients[,1] == -999),1] = NA
res_cov = res$cov_theta[rowmap, rowmap]
res_cov[which(res_cov == -999)] = NA
diag(res_cov)[which(diag(res_cov) < 0)] = NA
res_coefficients[,2] = diag(res_cov)
res_coefficients[which(res_coefficients[,2] > 0),2] = sqrt(res_coefficients[which(res_coefficients[,2] > 0),2])
id_NA = which(is.na(res_coefficients[,1]) | is.na(res_coefficients[,2]))
if (length(id_NA) > 0)
{
res_coefficients[-id_NA,3] = res_coefficients[-id_NA,1]/res_coefficients[-id_NA,2]
res_coefficients[-id_NA,4] = 1-pchisq(res_coefficients[-id_NA,3]^2, df=1)
}
else
{
res_coefficients[,3] = res_coefficients[,1]/res_coefficients[,2]
res_coefficients[,4] = 1-pchisq(res_coefficients[,3]^2, df=1)
}
res_final = list(coefficients=res_coefficients,
sigma=sqrt(res$sigma_sq),
covariance=res_cov,
converge=!res$flag_nonconvergence,
converge_cov=!res$flag_nonconvergence_cov)
return(res_final)
#### return results ###########################################################################################
###############################################################################################################
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.