#----------------------------------------------------------------------------#
#' @title Generate, format and save a tabulation of the most important predictors
#' in a logit model (glm/glmnet/speedglm).
#'
#' @description
#'
#' @export
#' @import data.table
#' @import xlsx
#' @param model
#' @param cluster_var_vector
#' @param output_path
#' @param add_dt
#' @param feat_lim
#' @return
#' @examples
feature_logit <- function(model, cluster_var_vector=NA, feat_lim=300, output_path,
add_dt=NULL, mode="default") {
# stats
norm_factor_0.9 <- 1.96
norm_factor_0.95 <- 1.645
if (is.na(cluster_var_vector[1])) {
if (!(class(model)[[1]] %in% c("cv.glmnet", "glmnet","lognet"))) {
logit_model_output <- summary(model)
coeff_raw <- logit_model_output$coefficients
coeff <- as.data.table(coeff_raw)
coeff[, var_name:=rownames(coeff_raw)]
coeff <- coeff[!var_name %like% "Intercept"]
setnames(coeff, c("estimate", "std", "z", "p", "var_name"))
current_date_time_id <- paste0(gsub("_", "", as.character(format(Sys.time(), "%H_%M_%S__%d_%m_%y"))),
"_", sample(1:40, 1, replace=F))
write.csv(coeff, paste0("feature_update", current_date_time_id, ".csv"), row.names=F)
coeff <- fread(paste0("feature_update", current_date_time_id ,".csv"))
} else {
coeff <- data.table(data.frame(coef.name = dimnames(coef(model))[[1]],
coef.value = matrix(coef(model))))
setnames(coeff, c("var_name", "estimate"))
coeff <- coeff[!var_name %like% "Intercept"]
coeff <- coeff[!estimate==0]
}
} else {
if (!(class(model)[[1]] %in% c("cv.glmnet", "glmnet","lognet"))) {
# automatic (cluster.vcov ---- multiwayvcov package)
#----------------------------------------------------------------------------#
var_cov_adj_f <- cluster.vcov(model, cluster_var_vector)
# none automatic (sandwich)
#----------------------------------------------------------------------------#
# get_CL_vcov <- function(model, cluster) {
# #calculate degree of freedom adjustment
# M <- length(unique(cluster))
# N <- length(cluster)
# K <- model$rank
# dfc <- (M/(M-1))*((N-1)/(N-K))
# # pearson residuals
# predict <- model$fitted.values
# residuals <- model$y - predict
# x_matrix <- copy(model$data[complete.cases(model$data)])
# x_matrix[, c(outcome_var):=NULL]
# x_matrix <- as.matrix(x_matrix)
# est_function_model <- residuals * cbind(1,x_matrix)
# #calculate the uj's
# # uj <- apply(estfun(model),2, function(x) tapply(x, cluster, sum))
# uj <- apply(est_function_model,2, function(x)
# tapply(x, cluster, sum))
# #use sandwich to get the var-covar matrix
# print(dfc)
# vcovCL <- dfc * sandwich_mod(model, meat=crossprod(uj)/N)
# return(vcovCL)
# }
# sandwich_mod <- function (x, bread. = bread, meat. = meat, ...) {
# if (is.list(x) && !is.null(x$na.action))
# class(x$na.action) <- "omit"
# if (is.function(bread.))
# bread. <- bread.(x)
# if (is.function(meat.))
# meat. <- meat.(x, ...)
# n <- NROW(estfun(x))
# # print(str(bread.))
# # print(str(meat.))
# return(1/n * (bread. %*% meat. %*% bread.))
# }
# var_cov_adj_man <- get_CL_vcov(model, data[,get("empi")][complete.cases(data)])
## obtain coefficients,p-value, SE
coeff_std_standard <- coeftest(model)
coeff_std_adj_f <- coeftest(model, var_cov_adj_f)
# coeff_std_adj_man <- coeftest(model, var_cov_adj_man)
## format
coeff <- data.table(var_name=rownames(coeff_std_adj_f),
estimate=coeff_std_adj_f[, 1],
std=coeff_std_adj_f[, 2], p=coeff_std_adj_f[, 4])
coeff <- coeff[!var_name %like% "Intercept"]
} else {
coeff <- data.table(data.frame(coef.name = dimnames(coef(model))[[1]],
coef.value = matrix(coef(model))))
setnames(coeff, c("var_name", "estimate"))
coeff <- coeff[!var_name %like% "Intercept"]
coeff <- coeff[!estimate==0]
}
}
coeff[, estimate:=as.numeric(estimate)]
coeff[, odds:=exp(estimate)]
coeff[, odds:=round(odds, digits=5)]
coeff[, abs_est:=abs(estimate)]
setorder(coeff, -abs_est)
coeff[, abs_est:=NULL]
# ensure that feat_lim appropriate
feat_lim <- min(feat_lim, nrow(coeff))
coeff <- coeff[1:feat_lim]
if (!(class(model)[[1]] %in% c("cv.glmnet", "glmnet","lognet"))) {
coeff[, p:=as.numeric(p)]
coeff[, sign:=ifelse(as.numeric(get("p")) < .001, "**** ",
ifelse(as.numeric(get("p"))< .01, "*** ",
ifelse(as.numeric(get("p")) < .05, "** ",
ifelse(as.numeric(get("p")) < .1, "* ",
" ")))), by=1:nrow(coeff)]
coeff[, CI_0.9_min:=estimate -(norm_factor_0.9 * std)]
coeff[, CI_0.9_max:=estimate + (norm_factor_0.9 * std)]
coeff[, CI_0.95_min:=estimate -(norm_factor_0.95 * std)]
coeff[, CI_0.95_max:=estimate + (norm_factor_0.95 * std)]
coeff[, odds_CI_0.9_min:=exp(CI_0.9_min)]
coeff[, odds_CI_0.9_max:=exp(CI_0.9_max)]
coeff[, odds_CI_0.95_min:=exp(CI_0.95_min)]
coeff[, odds_CI_0.95_max:=exp(CI_0.95_max)]
coeff[, odds_std:=odds*std]
coeff[, p:=round(p, digits=5)]
coeff[, std:=round(std, digits=5)]
coeff[, odds_std:=round(odds_std, digits=5)]
coeff[, odds_CI_0.9_min:=round(odds_CI_0.9_min, digits=5)]
coeff[, odds_CI_0.9_max:=round(odds_CI_0.9_max, digits=5)]
coeff[, odds_CI_0.95_min:=round(odds_CI_0.95_min, digits=5)]
coeff[, odds_CI_0.95_max:=round(odds_CI_0.95_max, digits=5)]
}
coeff[, estimate:=round(estimate, digits=5)]
var_coeff <- c("var_name", 'estimate', "std", "odds","odds_std", "odds_CI_0.9_min", "odds_CI_0.9_max",
"odds_CI_0.95_min", "odds_CI_0.95_max", "p", "sign")
var_coeff_name <- c("var_name", "estimate", "std_clust", "odds", "odds_std_clust",
"odds_CI_0.9_min_clust", "odds_CI_0.9_max_clust",
"odds_CI_0.95_min_clust", "odds_CI_0.95_max_clust", "p_clust", "sign_clust")
var_coeff_name <- var_coeff_name[var_coeff %in% names(coeff)]
var_coeff <- var_coeff[var_coeff %in% names(coeff)]
coeff <- coeff[, mget(var_coeff)]
setnames(coeff, var_coeff_name)
if (mode=="min_sign") {
# NO VARIABLES DROPPED
coeff[, p_sign:=gsub("[0-9\\.]", "", get(grep("p$|p_clust",names(coeff), value=T)))]
coeff[, c(grep("p$|p_clust",names(coeff), value=T)):=as.numeric(gsub("[^0-9\\.]", "", get(grep("p$|p_clust",names(coeff), value=T))))]
if ("odds_CI_0.9_min_clust" %in% names(coeff)) {
coeff[, c("odds_CI_0.9_min", "odds_CI_0.9_max", "odds_CI_0.95_min", "odds_CI_0.95_max"):=NULL]
}
}
# return(coeff)
# write.csv(coeff, paste0(output_folder, output_path),row.names=F)
# prepare wb
wb <- createWorkbook(type="xlsx")
sheet = createSheet(wb, "feat_importance")
# generate feat imp dt
feat_dt <- coeff
num_col <- which(sapply(feat_dt, function(x) class(x)[1]) %in% c("numeric"))
char_col <- which(sapply(feat_dt, function(x) class(x)[1]) %in% c("character", "factor"))
feat_dt[, c(num_col):=lapply(.SD, function(x) round(x, digits=4)), .SDcols=num_col]
# setnames(feat_dt, names(feat_dt)[char_col], "var_name")
feat_dt[, var_cat:=gsub("^([^\\.]*)(\\.).*","\\1",var_name)]
feat_dt[, var_timeframe:=gsub(".*_timeframe_(.*)$","\\1",var_name)]
feat_dt[var_timeframe %like% "days_to_last", var_timeframe:="days_to_last"]
feat_dt[var_timeframe==var_name, var_timeframe:="-"]
# add stats
if(!is.null(add_dt[[1]])) {
add_dt <- add_dt[var_name %in% feat_dt$var_name]
feat_dt <- Reduce(function(x,y) mymerge(x, y, var_list="var_name"),
list(feat_dt, add_dt))
}
# order
if (mode=="default" | class(model)[[1]] %in% c("cv.glmnet", "glmnet","lognet")) {
setorder(feat_dt, -odds)
} else if (mode=="min_sign") {
p_val <- grep("p$|p_clust",names(feat_dt), value=T)
setorderv(feat_dt, c(p_val, "odds"), order=c(1,-1))
}
# fomat and add to xlsx
addDataFrame(feat_dt,sheet, row.names=FALSE, col.names=TRUE)
# cell_style
var_cat <- c("dia", "enc", "prc", "dem", "med", "mic", "lab", "lvs", "ed")
var_cat_col <- brewer.pal(length(var_cat),"Spectral")
inv_mapply(function(cat, col) {
cs = CellStyle(wb) + Border(color=col) + Font(wb, color=col, isBold=TRUE)
rows_raw <- (which(feat_dt$var_cat %like% cat))+1
rows <- getRows(sheet, rows_raw)
cells <- getCells(rows, colIndex = c(char_col))
lapply(names(cells), function(ii)setCellStyle(cells[[ii]],cs))
}, cat=var_cat, col=var_cat_col)
saveWorkbook(wb,output_path)
}
#----------------------------------------------------------------------------#
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.