Nothing
Logit <-
function(my_formula, data=d, filter=NULL, ref_group=NULL,
digits_d=4, text_width=120,
brief=getOption("brief"),
res_rows=NULL, res_sort=c("cooks","rstudent","dffits","off"),
pred=TRUE, pred_all=FALSE, prob_cut=0.5, cooks_cut=1,
X1_new=NULL, X2_new=NULL, X3_new=NULL, X4_new=NULL,
X5_new=NULL, X6_new=NULL,
pdf_file=NULL, width=5, height=5, ...) {
# produce actual argument, such as from an abbreviation, and flag if not exist
res_sort <- match.arg(res_sort)
# a dot in a parameter name to an underscore
dots <- list(...)
if (!is.null(dots)) if (length(dots) > 0) {
change <- c("digits.d", "text.width", "res.rows", "res.sort", "pred.all",
"cooks.cut", "X1.new", "X2.new", "X3.new", "X4.new",
"X5.new", "X6.new", "pdf.file")
for (i in 1:length(dots)) {
if (names(dots)[i] %in% change) {
nm <- gsub(".", "_", names(dots)[i], fixed=TRUE)
assign(nm, dots[[i]])
get(nm)
}
}
}
if (missing(my_formula)) {
cat("\n"); stop(call.=FALSE, "\n","------\n",
"Specify a model by listing it first or set according to: my_formula\n\n")
}
# let deprecated mydata work as default
dfs <- .getdfs()
mydata.ok <- FALSE
if (!is.null(dfs)) {
if ("mydata" %in% dfs && !("d" %in% dfs)) {
d <- mydata
df.name <- "mydata"
mydata.ok <- TRUE
options(dname = df.name)
}
}
if (!mydata.ok) {
df.name <- deparse(substitute(data)) # get name of data table
options(dname = df.name)
}
# if a tibble, convert to data frame
if (any(grepl("tbl", class(data), fixed=TRUE)))
data <- data.frame(data)
max_new <- 6
show_R <- FALSE
explain <- FALSE
old.opt <- options()
on.exit(options(old.opt))
options(width=text_width)
# output
cor <- TRUE
if (brief) {
if (is.null(res_rows)) res_rows <- 0
pred <- FALSE
relate <- FALSE
show_R <- FALSE
}
else relate <- TRUE
# convert character variables to factors (for DV)
n_col <- apply(data, 2, function(x) sum(!is.na(x))) # num values per variable
nu.col <- apply(data, 2, function(x) length(unique(na.omit(x)))) # num unique
fnu.col <- logical(length=ncol(data)) # logical, initial values to FALSE
for (i in 1:ncol(data)) {
if (is.character(data[,i]))
if (nu.col[i] != n_col[i]) fnu.col[i] <- TRUE
}
data[fnu.col] <- lapply(data[fnu.col], as.factor)
pre <- "> "
line <- "--------------------------------------------------------------------\n"
if (!exists(df.name)) {
txtC <- "Function Logit() requires the data exist in a data frame\n"
if (df.name == "d")
txtA <- ", the default data frame name, " else txtA <- " "
txtB1 <- "Either create the data frame, such as with data.frame function,\n"
txtB2 <- " or specify the actual data frame with the parameter: data\n"
txtB <- paste(txtB1, txtB2, sep="")
cat("\n"); stop(call.=FALSE, "\n","------\n",
txtC, "Data frame ", df.name, txtA, "does not exist\n\n", txtB, "\n")
}
nm <- all.vars(my_formula) # names of vars in the model
n.vars <- length(nm)
n.pred <- n.vars - 1
if (!missing(filter)) { # subset rows
r <- eval(substitute(filter), envir=data, enclos=parent.frame())
r <- r & !is.na(r) # set missing for a row to FALSE
if (any(r))
data <- data[r,,drop=FALSE]
}
n.obs <- nrow(data)
collinear <- ifelse (n.pred > 1, TRUE, FALSE)
# check response variable for two values, only 0 and 1 if numeric
is.bin <- TRUE
if (is.factor(data[,nm[1]])) {
if (nlevels(data[,nm[1]]) != 2) is.bin <- FALSE
}
else { # numeric y with values 0 and 1
for (i in 1:n.obs)
if (!is.na(data[i,nm[1]]))
if (data[i,nm[1]]!=0 && data[i,nm[1]]!=1) is.bin <- FALSE
}
if (!is.bin) {
cat("\n"); stop(call.=FALSE, "\n","------\n",
"Response variable: ", nm[1], "\n",
"If numeric, can only have values of 0 or 1.\n",
"If a factor, can only have two levels.\n\n")
}
if ( !is.null(X1_new) && (n.pred) > max_new ) {
cat("\n"); stop(call.=FALSE, "\n","------\n",
"No new data for prediction if more than", max_new,
"predictor variables.\n\n")
}
in.data.frame <- TRUE
for (i in 1:n.vars) {
if (!(nm[i] %in% names(data))) {
cat("\n\n\n>>> Note: ", nm[i], "is not in the data frame.\n")
in.data.frame <- FALSE
}
}
# reduce data to just model variables with no missing data
data <- data[, c(nm), drop=FALSE]
data <- data[complete.cases(data), , drop=FALSE]
n.keep <- nrow(data)
# check for all numeric vars (except Y) in.data.frame <- TRUE
numeric.all <- TRUE
for (i in 2:n.vars) {
if (in.data.frame && !is.numeric(data[1,which(names(data) == nm[i])])) {
cat("\n>>> Note: ", nm[i], "is not a numeric variable.\n")
cat(" Indicator variables are created and analyzed.\n")
numeric.all <- FALSE
}
}
# if not all numeric, construct the indicator variables
if (!numeric.all) {
md <- data.frame(model.matrix(my_formula, data=data))
md[,1] <- data[,nm[1]]
names(md)[1] <- nm[1]
data <- md
rm(md)
nm <- names(data)
n.vars <- ncol(data)
n.pred <- n.vars - 1
my_formula <- DF2formula(data)
# if true integer, then convert from type double to integer
rows <- min(50, nrow(data)) # save some time scanning
fnu.col <- logical(length=ncol(data))
for (i in 1:ncol(data))
if (.is.integer(data[1:rows,i]))
fnu.col[i] <- TRUE
data[fnu.col] <- lapply(data[fnu.col], as.integer) # move to integer
}
# if a single pred, make sure to have a + slope
if (is.bin) {
lv1 <- levels(data[,nm[1]])[1]
lv2 <- levels(data[,nm[1]])[2]
if (n.pred==1) {
if (is.numeric(data[,nm[2]])) {
avg <- tapply(data[,nm[2]], data[,nm[1]], mean, na.rm=TRUE)
if (avg[1] > avg[2]) # switch order 0 and 1 for response
if (is.factor(data[,nm[1]]))
data[,nm[1]] <- factor(data[,nm[1]], levels=c(lv2,lv1))
}
} # end n.pred=1
}
# second level of Y, nm[1], is the default reference group
if (!is.null(ref_group)) {
if (lv1 != ref_group && lv2 != ref_group) {
cat("\n"); stop(call.=FALSE, "\n","------\n",
"Values of response ", nm[1], ": ",
levels(data[,nm[1]])[1], " ", levels(data[,nm[1]])[2], "\n",
"You specified a non-existent value, ref_group = ", ref_group, "\n\n")
}
if (lv2 != ref_group)
data[,nm[1]] <- factor(data[,nm[1]], levels=c(lv2,lv1))
}
# check new.data option for consistency
new.data <- FALSE
if ( (n.pred) <= max_new ) {
for (i in 1:(n.pred)) {
pp <- eval(parse(text=paste("X", toString(i),"_new",sep="")))
if (!is.null(pp)) new.data <- TRUE
}
if (new.data) for (i in 1:(n.pred)) {
pp <- eval(parse(text=paste("X", toString(i),"_new",sep="")))
if (is.null(pp)) {
cat("\n"); stop(call.=FALSE, "\n","------\n",
"Specified new data values for one predictor variable,",
" so do for all.\n\n")
}
}
}
# sort values of the one predictor variable for scatterplot
# so that the prediction/confidence intervals can be drawn
if (n.pred == 1) {
o <- order(data[,nm[2]], decreasing=FALSE)
data <- data[o,]
}
# -----------------------------------------------------------
# logit analysis
# all subsequent analysis done on data in model construct lm.out$model
# this model construct contains only model vars, with Y listed first
lm.out <- glm(my_formula, data=data, family="binomial", ...)
# -----------------------------------------------------------
# n.keep <- nrow(lm.out$model)
if (is.null(res_rows)) if (n.keep < 20) res_rows <- n.keep else res_rows <- 20
if (res_rows == "all")
res_rows <- n.keep # turn off resids with res_rows=0 call
cat("\n")
if (sys.nframe() == 1) { # only accurate if not called from model
cat("Data Frame: ", df.name, "\n\n")
}
for (i in 1:n.vars) {
ind <- i
.varlist(n.pred, ind, nm[i], "Predictor", n.obs, n.keep)
}
cat( "\n\n", " BASIC ANALYSIS", "\n\n")
cat("-- Estimated Model of", nm[1], "for",
"the Logit of Reference Group Membership\n")
# singularity check
coef <- lm.out$coefficients
if (anyNA(coef)) {
bad <- names(coef)[which(is.na(coef))]
cat("\n"); stop(call.=FALSE, "\n","------\n",
"Variable redundant with a prior predictor in the model: ", bad, "\n\n",
"Delete that variable, or corresponding prior redundant variables,\n",
" from the model.\n\n")
}
sm <- summary(lm.out)
sm1 <- sm$coefficients
# conf intervals from standard errors, not confint.glm that requires MASS
ci <- suppressMessages(confint.default(lm.out, level=0.95))
smc <- cbind(sm1, ci)
buf <- 0
for (i in 1:nrow(smc)) {
lng.lbl <- nchar(rownames(smc)[i])
if (lng.lbl > buf) buf <- lng.lbl
}
max.num <- integer(length=0)
for (icol in 1:6) {
max.num[icol] <- 0
for (i in 1:nrow(smc)) {
ln.nm <- nchar(as.character(floor(smc[i,icol]))) + digits_d + 1
if (ln.nm > max.num[icol]) max.num[icol] <- ln.nm
}
if (max.num[icol] < 9) max.num[icol] <- 9
}
est.lbl <- .fmtc("Estimate", max.num[1]+1)
ste.lbl <- .fmtc(" Std Err", max.num[2]+2)
t.lbl <- " z-value"
p.lbl <- " p-value"
lb.lbl <- .fmtc("Lower 95%", max.num[5]+3)
ub.lbl <- .fmtc("Upper 95%", max.num[6]+3)
cat("\n", rep(" ", buf), est.lbl, ste.lbl, t.lbl, p.lbl,
lb.lbl, ub.lbl, sep="", "\n")
for (i in 1:(nrow(smc))) {
rlb <- .fmtc(rownames(smc)[i], buf)
ub <- .fmt(smc[i,6], digits_d, max.num[6])
est <- .fmt(smc[i,1], digits_d, max.num[1])
ste <- .fmt(smc[i,2], digits_d, max.num[2]+1)
tvl <- .fmt(smc[i,3], 3, 8)
pvl <- .fmt(smc[i,4], 3, 8)
lb <- .fmt(smc[i,5], digits_d, max.num[5])
ub <- .fmt(smc[i,6], digits_d, max.num[6])
cat(rlb, est, ste, tvl, pvl, " ", lb, " ", ub, "\n")
}
# odds ratios and 95% CI
OR <- coef(lm.out)
orci <- exp(cbind(OR, ci))
cat("\n\n")
cat("-- Odds Ratios and Confidence Intervals\n\n")
max.num <- 9
OR.lbl <- .fmtc("Odds Ratio", max.num+1)
lb.lbl <- .fmtc("Lower 95%", max.num+3)
ub.lbl <- .fmtc("Upper 95%", max.num+3)
cat(rep(" ", buf+2), OR.lbl, lb.lbl, ub.lbl, sep="", "\n")
for (i in 1:(nrow(orci))) {
rlb <- .fmtc(rownames(orci)[i], buf)
or.est <- .fmt(orci[i,1], digits_d, max.num)
lb <- .fmt(orci[i,2], digits_d, max.num)
ub <- .fmt(orci[i,3], digits_d, max.num)
cat(rlb, " ", or.est, " ", lb, " ", ub, "\n")
}
# model fit
cat("\n\n")
cat("-- Model Fit\n")
cat("\n")
cat(" Null deviance:", .fmt(sm$null.deviance,3), "on",
.fmti(sm$df.null), "degrees of freedom\n")
cat("Residual deviance:", .fmt(sm$deviance,3), "on",
.fmti(sm$df.residual), "degrees of freedom\n\n")
cat("AIC:", sm$aic, "\n\n")
cat("Number of iterations to convergence:", sm$iter, "\n\n")
# collinearity
if (collinear) {
cat( "\n", "Collinearity", "\n", sep="")
cat("\n")
# need to run logistic reg on a numeric of Y to get usual
if (is.factor(data[,nm[1]])) {
Y <- as.numeric(data[,nm[1]])
m.f <- paste("Y ~", nm[2])
if (n.pred > 1)
for (i in 2:n.pred) m.f <- paste(m.f, "+", nm[i+1])
m.d <- data.frame(Y, data, stringsAsFactors=TRUE)
}
else {
m.f <- my_formula
m.d <- data
}
r.out <- lm(as.formula(m.f), data=m.d)
MSW <- anova(r.out)[n.vars,3]
sterrs <- summary(r.out)$coefficients[,2]
vif <- numeric(length = 0)
tol <- numeric(length = 0)
for (i in 1:n.pred) {
v <- var(r.out$model[i+1])
vif[i] <- (v * (n.keep-1) * sterrs[i+1]^2) / MSW
tol[i] <- 1 / vif[i]
}
out <- cbind(tol, vif)
colnames(out) <- c("Tolerance", " VIF")
rownames(out) <- nm[2:length(nm)]
print(round(out,3))
}
if (res_rows > 0)
.logit3Residual(lm.out, nm, df.name,
n.vars, n.pred, n.obs, n.keep, digits_d, pre, line,
res_sort, res_rows, cooks_cut)
if (pred)
.logit4Pred(lm.out, nm, df.name, my_formula, brief, res_rows,
n.vars, n.pred, n.obs, n.keep, digits_d, pre, line,
new.data, pred, pred_all, prob_cut,
in.data.frame, X1_new,
X2_new, X3_new, X4_new, X5_new, X6_new,
pdf_file, width, height)
# Also calls .logit5Confuse.R
return(invisible(lm.out))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.