README.md

R package MetaIntegration

Ensemble Meta-Prediction Framework to Integrate Multiple Regression Models

Overview

This R package is an ensemble meta-prediction framework to integrate multiple regression models into a current study.

Installation

If the devtools package is not yet installed, install it first:

install.packages('devtools')
# install the package from Github:
devtools::install_github('umich-biostatistics/MetaIntegration') 

Once installed, load the package:

library(MetaIntegration)

Example Usage

For this example, we simulate data and test each of the functions.

Simulate data

############## Generating 1 internal dataset of size n ###########################################
# Full model: Y|X1, X2, X3, X4, B
# (X1, X2, X3, X4, B) follows normal distribution with mean zero, variance one and correlation 0.3
# Y|X1, X2, X3, X4, B follows Bernoulli[expit(-1-0.5X+0.5B)], where expit(x)=exp(x)/[1+exp(x)]
##################################################################################################
set.seed(2333)
n = 200
data.n = data.frame(matrix(ncol = 5, nrow = n))
colnames(data.n) = c('Y', 'X1', 'X2', 'X3', 'B')
data.n[,c('X1', 'X2', 'X3', 'B')] = MASS::mvrnorm(n, rep(0,4), diag(0.7,4)+0.3)
############# Function 1 #########################################
data.n$Y = rbinom(n, 1, expit(-1 - 0.5*(data.n$X1 + data.n$X2 + data.n$X3) + 0.5*data.n$B))

############# Generate k=3 external models #########################################
# Full model: Y|X1, X2, X3, B
# Reduced external model 1: Y|X1, X2 with sample size m1
# Reduced external model 2: Y|X1, X3 with sample size m2
# Reduced external model 3: Y|X1, X2, X3 with sample size m3
####################################################################################
# generate data from the full model first, then fit the reduced logistic regression 
# on the data to obtain the beta estiamtes and the corresponsing estimated variance 
####################################################################################
m1 = 500
m2 = 2000
m3 = 30000
data.m1 = data.frame(matrix(ncol = 5, nrow = m1))
data.m2 = data.frame(matrix(ncol = 5, nrow = m2))
data.m3 = data.frame(matrix(ncol = 5, nrow = m3))
names(data.m1) = names(data.m2) = names(data.m3) = c('Y', 'X1', 'X2', 'X3', 'B')

data.m1[,c('X1', 'X2', 'X3', 'B')] = MASS::mvrnorm(m1, rep(0,4), diag(0.7,4)+0.3)
data.m2[,c('X1', 'X2', 'X3', 'B')] = MASS::mvrnorm(m2, rep(0,4), diag(0.7,4)+0.3)
data.m3[,c('X1', 'X2', 'X3', 'B')] = MASS::mvrnorm(m3, rep(0,4), diag(0.7,4)+0.3)
data.m1$Y = rbinom(m1, 1, expit(-1 - 0.5*(data.m1$X1 + data.m1$X2 + data.m1$X3) + 0.5*data.m1$B))
data.m2$Y = rbinom(m2, 1, expit(-1 - 0.5*(data.m2$X1 + data.m2$X2 + data.m2$X3) + 0.5*data.m2$B))
data.m3$Y = rbinom(m3, 1, expit(-1 - 0.5*(data.m3$X1 + data.m3$X2 + data.m3$X3) + 0.5*data.m3$B))

#fit Y|X using logistic regression to obtain the external beta estimates
fit.E1 = glm(Y ~ X1 + X2,      data = data.m1, family = binomial(link='logit'))
fit.E2 = glm(Y ~ X1 + X3,      data = data.m2, family = binomial(link='logit'))
fit.E3 = glm(Y ~ X1 + X2 + X3, data = data.m3, family = binomial(link='logit'))

#Save the beta estiamtes and the corresponsing estimated variance of the reduced external model 1
beta.E1 = coef(fit.E1)
names(beta.E1) = c('int', 'X1', 'X2')
V.E1 = vcov(fit.E1)

#Save the beta estiamtes and the corresponsing estimated variance of the reduced external model 2
beta.E2 = coef(fit.E2)
names(beta.E2) = c('int','X1','X3')
V.E2 = vcov(fit.E2)

#Save the beta estiamtes and the corresponsing estimated variance of the reduced external model 3
beta.E3 = coef(fit.E3)
names(beta.E3) = c('int','X1','X2','X3')
V.E3 = vcov(fit.E3)

#Save all the external model information into lists for later use
betaHatExt_list = list(Ext1 = beta.E1, Ext2 = beta.E2, Ext3 = beta.E3)
CovExt_list = list(Ext1 = V.E1, Ext2 = V.E2, Ext3 = V.E3)
rho = list(Ext1 = n/m1, Ext2 = n/m2, Ext3 = n/m3)

fxnCC_LogReg()

Constraint maximum likelihood (CML) method for logistic regression (binary outcome Y).

fit.gamma.I = glm(Y ~ X1 + X2 + X3 + B, data = data.n, family = binomial(link='logit'))
gamma.I = coef(fit.gamma.I)
gamma.CML1 = fxnCC_LogReg(p=3, 
                          q=5, 
                          YInt=data.n$Y, 
                          XInt=cbind(data.n$X1,data.n$X2), 
                          BInt=cbind(data.n$X3,data.n$B), 
                          betaHatExt=beta.E1, 
                          gammaHatInt=gamma.I, 
                          n=n, 
                          tol=1e-8, 
                          maxIter=400,
                          factor=1)[["gammaHat"]]
gamma.CML2 = fxnCC_LogReg(p=3, 
                          q=5, 
                          YInt=data.n$Y, 
                          XInt=cbind(data.n$X1,data.n$X3), 
                          BInt=cbind(data.n$X2, data.n$B), 
                          betaHatExt=beta.E2, 
                          gammaHatInt=c(gamma.I[1:2],gamma.I[4],gamma.I[3],gamma.I[5]), 
                          n=n, 
                          tol=1e-8, 
                          maxIter=400,
                          factor=1)[["gammaHat"]]
gamma.CML2 = c(gamma.CML2[1:2], gamma.CML2[4], gamma.CML2[3], gamma.CML2[5])
gamma.CML3 = fxnCC_LogReg(p=4, 
                          q=5, 
                          YInt=data.n$Y, 
                          XInt=cbind(data.n$X1,data.n$X2,data.n$X3), 
                          BInt=cbind(data.n$B), 
                          betaHatExt=beta.E3, 
                          gammaHatInt=gamma.I, 
                          n=n, 
                          tol=1e-8, 
                          maxIter=400,
                          factor=1)[["gammaHat"]]

asympVar_LogReg()

Asymptotic variance-covariance matrix for gamma_Int and gamma_CML for logistic regression (binary outcome Y).

asy.CML = asympVar_LogReg(k=3, 
                          p=4,
                          q=5, 
                          YInt=data.n$Y, 
                          XInt=data.n[,c('X1','X2','X3')], #covariates that appeared in at least one external model
                          BInt=data.n$B,  #covariates that not used in any of the external models
                          gammaHatInt=gamma.I, 
                          betaHatExt_list=betaHatExt_list, 
                          CovExt_list=CovExt_list, 
                          rho=rho, 
                          ExUncertainty=TRUE)
asyV.I = asy.CML[['asyV.I']]                      #variance of gamma.I
asyV.CML1 = asy.CML[['asyV.CML']][[1]]            #variance of gamma.CML1
asyV.CML2 = asy.CML[['asyV.CML']][[2]]            #variance of gamma.CML2
asyCov.CML1.I = asy.CML[['asyCov.CML.I']][[1]]    #covariance of gamma.CML1 and gamma.I
asyCov.CML2.I = asy.CML[['asyCov.CML.I']][[2]]    #covariance of gamma.CML2 and gamma.I
asyCov.CML12 = asy.CML[['asyCov.CML']][['12']]    #covariance of gamma.CML1 and gamma.CML2

get_gamma_EB()

Calculate the empirical Bayes (EB) estimates.

gamma.EB1 = get_gamma_EB(gamma_I=gamma.I, gamma_CML=gamma.CML1, asyV.I=asyV.I)[['gamma.EB']]
gamma.EB2 = get_gamma_EB(gamma_I=gamma.I, gamma_CML=gamma.CML2, asyV.I=asyV.I)[['gamma.EB']]
gamma.EB3 = get_gamma_EB(gamma_I=gamma.I, gamma_CML=gamma.CML3, asyV.I=asyV.I)[['gamma.EB']]

get_var_EB()

Using simulation to obtain the asymptotic variance-covariance matrix of gamma_EB, package corpcor and MASS are required.

#Get the asymptotic variance of the EB estimates
V.EB = get_var_EB(k=3, 
                  q=5, 
                  gamma.CML=c(gamma.CML1, gamma.CML2, gamma.CML3), 
                  gamma.I = gamma.I,
                  asy.CML=asy.CML, 
                  seed=2333, 
                  nsim=2000)
asyV.EB1 = V.EB[['asyV.EB']][[1]]             #variance of gamma.EB1
asyV.EB2 = V.EB[['asyV.EB']][[2]]             #variance of gamma.EB2
asyV.EB3 = V.EB[['asyV.EB']][[3]]             #variance of gamma.EB3
asyCov.EB1.I = V.EB[['asyCov.EB.I']][[1]]     #covariance of gamma.EB1 and gamma.I
asyCov.EB2.I = V.EB[['asyCov.EB.I']][[2]]     #covariance of gamma.EB2 and gamma.I
asyCov.EB3.I = V.EB[['asyCov.EB.I']][[3]]     #covariance of gamma.EB2 and gamma.I
asyCov.EB12 = V.EB[['asyCov.EB']][['12']]     #covariance of gamma.EB1 and gamma.EB2
asyCov.EB13 = V.EB[['asyCov.EB']][['13']]     #covariance of gamma.EB1 and gamma.EB3
asyCov.EB23 = V.EB[['asyCov.EB']][['23']]     #covariance of gamma.EB2 and gamma.EB3

get_OCW()

Obtain the proposed Optimal covariate-Weighted (OCW) estimates, package Rsolnp is required.

get_OCW(k=3, 
        q=5, 
        data.XB=data.n[,c('X1','X2','X3','B')], 
        gamma.EB=c(gamma.EB1, gamma.EB2, gamma.EB3), 
        V.EB=V.EB)

get_SCLearner()

Obtain the proposed Selective Coefficient-Learner (SC-Learner) estimates.

pred.matrix = matrix(c(1,1,1,0,0,
                       1,1,0,1,0,
                       1,1,1,1,0), 5, 3)
rownames(pred.matrix) = c('int','X1','X2','X3','B')
colnames(pred.matrix) = c('E1','E2','E3')

get_SCLearner(k=3,
              q=5,
              pred.matrix=pred.matrix,
              gamma.EB=cbind(gamma.EB1, gamma.EB2, gamma.EB3),
              V.EB)

Current Suggested Citation

Gu, T., Taylor, J.M.G. and Mukherjee, B. (2020). An ensemble meta-prediction framework to integrate multiple regression models into a current study. Manuscript in preparation.



Try the MetaIntegration package in your browser

Any scripts or data that you put into this service are public.

MetaIntegration documentation built on March 18, 2021, 1:06 a.m.