#' Estimate propensity weights for biased samples
#'
#' Estimate propensity weights for biased samples using information from a representative sample.
#'
#' @param data Combined dataset including the convenience sample and representative sample with a subject
#' identifier, \code{ID}, in the first column, any covariates needed for matching, and an indicator, \code{biased},
#' that is 1 for subjects in the biased sample in the last column. Note that all factors in the dataset
#' should either be binary or of class factor
#' @param weight_model Which propensity weight estimation model should be used. Either \code{"logistic"},
#' \code{"randomForest"}, \code{"CBPS"}, or \code{"entbal"}
#' @param incl_wts_for_rep A logical value indicating whether propensity weights for
#' observations from the representative sample should be included in the output.
#' @param return_model_fit An optional flag which changes the output so that it
#' returns a list of estimated propensity weights and the model fit. This is useful for downstream
#' uncertainty estimates. Note that a model fit is only returned if a logistic propensity weight
#' estimation model is selected (weight_model == "logistic")
#'
#' @return \code{estweight} returns a dataframe containing IDs and the corresponding estimated
#' propensity weights. If the flag return_model_fit == TRUE and the weight_model == "logistic", it instead returns a list where the
#' first element is the dataframe of IDS and weights and the second element is the model fit.
#' @export
#'
#' @examples
#' #' data("mtcars")
#' repsample = mtcars
#' n = nrow(repsample)
#' expit = function(x) {exp(x) / (1 + exp(x))}
#'
#' # Calculate probability of being oversampled
#' repsample$sampprob = expit(.01*(repsample$am*4 + repsample$carb*3
#' + repsample$drat*.9 -repsample$mpg*repsample$disp*.05 + .002*repsample$hp^2 + 80))
#'
#' # draw biased and representative samples
#' b.samp = repsample[sample(1:n, 500, prob = repsample$sampprob, replace = TRUE), ]
#' r.samp = repsample[sample(1:n, 500, replace = TRUE), ]
#'
#' # Create indicator of biased sample membership
#' b.samp$biased = 1; r.samp$biased = 0
#'
#'# Format data to pass to function
#'Xcomb = data.frame(ID = 1:(1000), rbind(b.samp, r.samp))
#'Xfit = Xcomb[,c("ID", colnames(Xcomb)[c(2:8,10:12)], "biased")]
#'
#'# Estimate propensity weights
#'estweight(data = Xfit, weight_model = "logistic")
estweight = function(data, weight_model = "logistic",
incl_wts_for_rep = FALSE,
return_model_fit = FALSE){
# Check inputs
if(!(weight_model %in% c("logistic", "randomForest", "CBPS","entbal"))){
warning("Invalid weight_model input. Will use default logistic weight estimation model")
weight_model = "logistic"
}
# if weight_model != logistic, the model fit can't be returned
# so the return_model_fit will be changed to reflect that
if(weight_model != "logistic"){
return_model_fit = FALSE
}
cols = colnames(data)
if(sum(cols%in%c("ID","biased"))!=2){
stop("Data set up incorrectly. Make sure the data includes columns named 'ID' and 'biased'")
}
if(!is.logical(incl_wts_for_rep)){
stop("incl_wts_for_rep needs to be a logical value")
}
# Identify covariate names and which are factors
cov = cols[!(cols%in%c("ID","biased"))]
if(length(cov)==1){
fs = is.factor(data[,cov])
if(is.factor(data[,cov])){
fact_vars = cov
cont_vars = NULL
} else{
fact_vars = NULL
cont_vars = cov
}
} else{
fs = sapply(data[,cov],is.factor)
fact_vars = cov[fs]
cont_vars = cov[!fs]
}
## Estimate selection weights
if(weight_model == "logistic"){
# Use forward-selection AIC to select a model
# Model scope includes second order terms
form_min = stats::as.formula("biased ~ 1")
form_max = stats::as.formula(paste0("biased ~",paste0(c(cov, paste0("I(",cont_vars,"^2)")),collapse = "+")))
fit_min = stats::glm(formula = form_min, family = stats::binomial, data = data)
forward = stats::step(fit_min,scope=list(lower=form_min,upper=form_max),
direction="forward", trace = 0)
# fit selected model
estwt_form = stats::formula(forward)
estwt_fit = stats::glm(formula = estwt_form, family = stats::binomial, data = data)
prob_bias = stats::fitted(estwt_fit, "response")
htweight_unnorm = (1-prob_bias)/prob_bias
htweight = htweight_unnorm/sum(htweight_unnorm)
} else if(weight_model == "randomForest"){
form_max = stats::as.formula(paste0("factor(biased) ~",paste0(cov,collapse = "+")))
fit_rf = randomForest::randomForest(formula = form_max, data = data, type = "classification")
prob_bias1 = stats::predict(fit_rf,type = "prob")[,"1"]
prob_bias = ifelse(prob_bias1==0,.01,ifelse(prob_bias1==1,.99,prob_bias1)) # deal with est wts of 0 or 1
htweight_unnorm = (1-prob_bias)/prob_bias
htweight = htweight_unnorm/sum(htweight_unnorm)
} else if(weight_model == "CBPS"){
form_max = stats::as.formula(paste0("factor(biased) ~",paste0(c(fact_vars, paste0("poly(",cont_vars,",2)")),collapse = "+")))
fit_cbps = CBPS::CBPS(formula = form_max, data = data, ATT = 2) # Representative sample should be the 'treatment' group
prob_bias = 1 - fit_cbps$fitted.values # want prob c2c (not nhanes)
htweight_unnorm = (1-prob_bias)/prob_bias
htweight = htweight_unnorm/sum(htweight_unnorm)
} else if(weight_model == "entbal"){
form_max = stats::as.formula(paste0("factor(biased) ~",paste0(cov,collapse = "+")))
eb_pars_att = list(exp_type = 'binary',
estimand = 'ATT',
n_moments = 3,
optim_method = 'L-BFGS-B',
verbose = T,
opt_constraints = c(-250,250),
bal_tol = 1e-8,
max_iters = 1000,
which_z = 0) # this will match covariates to those in NHANES-REP
estwts_eb_att = entbal::entbal(form_max, data = data,
eb_pars = eb_pars_att)
htweight = estwts_eb_att$wts
} else{
stop("No sampling weight estimation model selected!")
}
## create dataframe for outcome model
weights = data.frame(ID = data$ID,htweight)
if(!incl_wts_for_rep){
weights = weights[data$biased == 1,]
}
if(return_model_fit){
return(list(weights = weights,
estwt_fit = estwt_fit))
} else{
return(weights)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.