#' Run glm with Lasso prescreening built into boostrapped confidence intervals
#'
#' @param Y Outcome variable (continuous, such as LAZ, or binary, such as diarrhea)
#' @param Ws data frame that includes candidate adjustment covariates to screen
#' @param family GLM model family (gaussian, binomial, poisson, or negative binomial). Use "neg.binom" for Negative binomial.
#' @param pval The p-value threshold: any variables with a p-value from the lielihood ratio test below this threshold will be returned. Defaults to 0.2
#' @param print Logical for whether to print function output, defaults to TRUE.
#'
#' @return
#' Function returns the list of variable names with a likelihood ratio test p-value <0.2 (unless a custom p-value is specified).
#' @export
#'
#' @examples
#'
# data(opposites)
# res<-clusbootglm(SCORE~Time*COG,data=opposites,clusterid=Subject)
# summary(res)
# library(ClusterBootstrap)
# data(opposites)
# res<-clusbootglm(SCORE~Time*COG,data=opposites,clusterid=Subject)
# summary(res)
# data=opposites
# model="SCORE~Time*COG"
#
# clusterid="id"
# family = "binomial"
# B = 50
# confint.level = 0.95
# n.cores = 1
# data=dmat
# clusterid="id"
# Ws=colnames(W)
# forcedW=NULL
# pair=NULL
# #temp!
# B = 10
# confint.level = 0.95
# n.cores = 1
# data=dmat
# clusterid="id"
# Ws=colnames(W)
# forcedW=NULL
# pair=NULL
# #temp!
# B = 10
# confint.level = 0.95
# n.cores = 1
cowboy_glm <- function (data, clusterid="id", Ws, forcedW=NULL, pair=NULL, with.replacement=T,
family = "gaussian", B = 200, confint.level = 0.95, n.cores = 8){
require(rsample)
# tt_cores <- detectCores()
# if (n.cores > tt_cores) {
# message(sprintf("Note: \"n.cores\" was set to %d, but only %d are available. Using all cores.",
# n.cores, tt_cores))
# }
set.seed(12345)
#fit full and prescreened model
bfull <- paste(c("tr",Ws, forcedW,pair), collapse="+")
if(!is.null(Ws)){
prescreened_Ws<-washb_glmnet_prescreen(Y=data$Y,data %>% select(!!(Ws)),family=family)
}else{
prescreened_Ws=NULL
}
b <- paste(c("tr",prescreened_Ws, forcedW,pair), collapse="+")
full_model <-as.formula(paste("Y ~ ",bfull,sep = ""))
model <-as.formula(paste("Y ~ ",b,sep = ""))
res.or <- glm(full_model, family = family, data = data)
res.or.screen <- glm(model, family = family, data = data)
confint.pboundaries = c((1 - confint.level)/2, 1 - (1 - confint.level)/2)
confint.Zboundaries = qnorm(confint.pboundaries)
n <- nrow(data)
p <- length(res.or$coef)
coefs <- matrix(NA, nrow = B, ncol = p)
if(with.replacement){
f=NA
D <- data %>% as_tibble() %>% nest(-id)
bs <- bootstraps(D, times = B)
for(i in 1:B){
set.seed(i)
dboot <- as.tibble(bs$splits[[i]]) %>% arrange(id) %>% unnest(cols = c(data))
if(!is.null(Ws)){
prescreened_Ws<-washb_glmnet_prescreen(Y=dboot$Y, dboot %>% select(!!(Ws)),family=family)
}else{
prescreened_Ws=NULL
}
b <- paste(c("tr",prescreened_Ws, forcedW,pair), collapse="+")
model <-as.formula(paste("Y ~ ",b,sep = ""))
bootcoef <- tryCatch(coef(glm(model, family = family,data = dboot)), error = function(x) rep(as.numeric(NA),p))
coefs[i, which(names(res.or$coef) %in% names(bootcoef))] <- bootcoef
}
}else{
#arguments <- as.list(match.call())
#clusterid <- eval(arguments$clusterid, data)
cluster <- as.character(data[[clusterid]])
clusters <- unique(data[[clusterid]])
Obsno <- split(1:n, cluster)
f = matrix(clusters, length(clusters), B)
ff = matrix(f, prod(dim(f)), 1)
fff = sample(ff)
f = matrix(fff, length(clusters), B)
# if (is.numeric(n.cores) & n.cores > 0) {
# if (n.cores == 1) {
#temp
for(i in 1:B){
set.seed(i)
j <- f[, i]
obs <- unlist(Obsno[j])
dboot=data[obs, ]
table(dboot$Y)
if(!is.null(Ws)){
prescreened_Ws<-washb_glmnet_prescreen(Y=dboot$Y, dboot %>% select(!!(Ws)),family=family)
}else{
prescreened_Ws=NULL
}
b <- paste(c("tr",prescreened_Ws, forcedW,pair), collapse="+")
model <-as.formula(paste("Y ~ ",b,sep = ""))
bootcoef <- tryCatch(coef(glm(model, family = family,data = dboot)), error = function(x) rep(as.numeric(NA),p))
coefs[i, which(names(res.or$coef) %in% names(bootcoef))] <- bootcoef
}
# }
# if (n.cores > 1) {
# cl <- makeCluster(max(min(n.cores, tt_cores, 2)))
# previous_RNGkind <- RNGkind()[1]
# RNGkind("L'Ecuyer-CMRG")
# nextRNGStream(.Random.seed)
# clusterExport(cl, varlist = c("f", "Obsno",
# "model", "family", "data",
# "p", "res.or", "clusbootglm_sample_glm"),
# envir = environment())
# splitclusters <- 1:B
# out <- parSapplyLB(cl, splitclusters, function(x) clusbootglm_sample_glm(f,
# x, Obsno, model, family, data, p, res.or))
# coefs <- t(out)
# stopCluster(cl)
# RNGkind(previous_RNGkind)
# }
# }
}
#XXXXXX Check if this is needed:
invalid.samples <- colSums(is.na(coefs))
names(invalid.samples) <- colnames(coefs) <- names(res.or$coef)
samples.with.NA.coef <- which(is.na(rowSums(coefs)))
sdcoefs <- apply(coefs, 2, sd, na.rm = TRUE)
ci_percentile <- t(apply(coefs, 2, quantile, probs = confint.pboundaries, na.rm = TRUE))
ci_parametric <- cbind(res.or$coef + confint.Zboundaries[1] * sdcoefs, res.or$coef + confint.Zboundaries[2] * sdcoefs)
#get error with bias correction and acceleration interval (for skewed data)
#ci_BCa <- ClusterBootstrap:::confint_BCa(B, invalid.samples, model, data, clusterid, family, coefs, res.or$coef, p, confint.Zboundaries)
ci_BCa <- matrix(NA,1,1)
# rownames(ci_BCa) <- dimnames(ci_parametric)[[1]]
# colnames(ci_BCa) <- dimnames(ci_percentile)[[2]]
rownames(ci_percentile) <- dimnames(ci_parametric)[[1]]
colnames(ci_parametric) <- dimnames(ci_percentile)[[2]]
result <- list(call = match.call(), model = model, family = family,
B = B, coefficients = coefs, data = data, bootstrap.matrix = f,
subject.vector = clusterid, lm.coefs = res.or$coef, boot.coefs = colMeans(coefs, na.rm = TRUE),
boot.sds = sdcoefs, ci.level = confint.level,
percentile.interval = ci_percentile, parametric.interval = ci_parametric,
BCa.interval = ci_BCa, samples.with.NA.coef = samples.with.NA.coef,
failed.bootstrap.samples = invalid.samples)
class(result) <- "clusbootglm"
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.