#' @title Create formula
#' @description Creates a formula object which can be passed to many R modeling functions from a vector of variable names.
#' @param y character string, target variable
#' @param xList character vector, predictor variables (Right Hand Side variables) for formula
#' @return formula object
#' @export
#' @examples
#' form <- makeForm('mpg', c('drat', 'wt', 'hp'))
#' summary(lm(form, data=mtcars))
makeForm <- function(y, xList) {
if(as.character(y)[1]=='as.factor') y<-paste(paste(as.character(y), collapse='('), ')', sep='')
form <- (as.formula(paste(y, '~', paste(xList, collapse='+'), sep='')))
return(form)
}
#' @title Edit a formula
#' @description Easy way to add or subtract predictor variables to a formula
#' @param form formula, the formula you will be editing
#' @param sub character vector, predictor variables to subtract (remove) from formula
#' @param add character vector, predictor variables to add to formula
#' @return formula, edited formula object
#' @export
#' @examples
#' form <- as.formula('mpg~cyl+hp+drat+qsec')
#' editForm(form, add=c('wt', 'gear', 'carb'))
#' editForm(form, sub=c('cyl', 'hp', 'qsec', 'variableNotInFormula'))
#' editForm(form, sub=c('cyl', 'hp', 'qsec'), add=c('wt'))
editForm <- function(form, add=NULL, sub=NULL){
eform <- form
X <- unlist(strsplit(as.character(form)[[3]], ' +'))
X <- X[X!='+']
Y <- form[[2]]
if(is.null(sub)==F){
Xsub <- X[!X %in% sub]
eform <- makeForm(Y,Xsub)
}
if(is.null(add)==F){
if('Xsub' %in% ls()) X <- Xsub
Xadd <- c(X, add)
eform <- makeForm(Y,Xadd)
}
return(eform)
}
shuffleForm <- function(form){
eform <- form
X <- unlist(strsplit(as.character(form)[[3]], ' +'))
X <- X[X!='+']
X <- X[sample(1:length(X), replace=F)]
Y <- form[[2]]
shuffledForm <- makeForm(Y, X)
return(shuffledForm)
}
#' @title Variable importance from iteratively shuffled orders of predictor variables
#' @description Runs multiple anova analyses to assess deviance explained by each predictor in shuffled orders, iteratively.
#' Currently using only logistic regression. Could be generalized.
#' @param form formula to be passed to \code{\link{travCount}}.
#' @param df data.frame of data to be used for analysis
#' @param n number of iterations for shuffled ANOVA analysis
#' @param test character string, statistical test to run. Default is 'Chisq'.
#' @return data.frame of results
#' @export
#' @examples
#' form <- as.formula('am~wt+gear+carb+cyl+hp+drat+qsec')
#' shuffleAnova(form, mtcars, n=50)
shuffleAnova <- function(form, df, n=5, test='Chisq'){
# creating a list of all anova analyses (with shuffled order of x variables)
anovaL <- list()
for(i in 1:n){
sform <- shuffleForm(form)
anovaL[[i]] <- anova(glm(sform, data=df, family=binomial(logit)), test=test)
}
# create a summary table from running multiple anova analyses with different subsets of variables
pred <- unique(unlist(lapply(anovaL, function(x) unique(rownames(x)))))
res <- data.frame(xvar=pred)
for(i in pred){
res$deviance_mean[res$xvar==i] <- mean(unlist(lapply(anovaL, function(x) x[rownames(x)==i,2])))
res$deviance_median[res$xvar==i] <- median(unlist(lapply(anovaL, function(x) x[rownames(x)==i,2])))
res$deviance_sd[res$xvar==i] <- sd(unlist(lapply(anovaL, function(x) x[rownames(x)==i,2])))
res$deviance_min[res$xvar==i] <- min(unlist(lapply(anovaL, function(x) x[rownames(x)==i,2])))
res$deviance_max[res$xvar==i] <- max(unlist(lapply(anovaL, function(x) x[rownames(x)==i,2])))
res$pvalue_median[res$xvar==i] <- median(unlist(lapply(anovaL, function(x) x[rownames(x)==i,5])), na.rm=T)
}
p <- cut(res$pvalue_median, breaks=c(0,.001,.01,.05,.1, 1))
res$pstar <- factor(x=p, levels=levels(p), labels=c('***', '**', '*', '.', ''))
res <- res[order(res$deviance_mean, decreasing=T),]
return(res)
}
#' @title Format logistic regression table
#' @description Turns a logistic regression object into a regression table
#' and outsheets it to a csv if you choose
#' @param reg logistic regression object. output from \code{\link{glm}}
#' @param file filepath of ouput. \code{txt} or \code{csv} object.
#' @param xvlab data.frame, lookup table for variable names. First column is codename, second column is the pretty printed name.
#' @param stats include statistics in output
#' @return data.frame of results formatted nicely
#' @export
#' @examples
#' reg <- glm(am~qsec+hp, data=mtcars, family=binomial(logit))
#' logit2tab(reg)
#' longnames <- data.frame(short = c('wt', 'mpg', 'cyl', 'drat', 'hp', 'am', 'qsec'),
#' long = c('Weight', 'Miles Per Gallon', 'Cylinder', 'D.R.A.T', 'Horsepower', 'A.M.', 'Q Seconds'))
#' logit2tab(reg, xvlab=longnames, stats=T)
logit2tab <- function(reg, file=NULL, xvlab=NULL, stats=F){
tab <- summary(reg)$coefficients
tab <- data.frame(coef=tab[,'Estimate'], oddsRatio=exp(tab[,'Estimate']), stdError=tab[,'Std. Error'], p=tab[,'Pr(>|z|)'], stringsAsFactors=F)
p <- cut(tab$p, breaks=c(0,.001,.01,.05,.1, 1))
tab$pstar <- factor(x=p, levels=levels(p), labels=c('***', '**', '*', '.', ''))
if(is.null(xvlab)==F) {
tab$Metric <- as.character(xvlab[match(rownames(tab), xvlab[,1]), 2])
tab$Metric[is.na(tab$Metric)] <- ''
tab$Metric[rownames(tab)=='(Intercept)'] <- '(Intercept)'
tab <- tab[, c('Metric', names(tab)[!names(tab) %in% 'Metric'])]
}
if(is.null(file)==F) write.csv(tab, file=file)
if(stats==T){
tab[nrow(tab)+1, ] <- ''; rownames(tab)[nrow(tab)] <- ''
tab[nrow(tab)+1,1:2] <- c('aic', reg$aic)
tab[nrow(tab)+1,1:2] <- c('n', nrow(reg$model))
tab[nrow(tab)+1,1:2] <- c('Chi Sq.', reg$null.deviance - reg$deviance)
tab[nrow(tab)+1,1:2] <- c('P(>|Chi|)', pchisq(reg$null.deviance-reg$deviance, 2, lower.tail=F, df=reg$df.null-reg$df.residual))
tab[is.na(tab)] <- ''
}
return(tab)
}
#' @title All-subsets logistic regression
#' @description Plots the output of the bestglm BestModels object. Similar to the visual output of plotting a regsubsets object
#' @param bglm \code{bestglm} object from \code{\link{bestglm}}
#' @param rc decimal places to display on y-axis of plot
#' @return plot
#' @seealso \code{\link{bestglm}}, \code{\link{leaps}}
#' @import bestglm
#' @export
#' @examples
#' require('bestglm')
#' b <- bestglm(Xy=mtcars[,c('mpg', 'hp', 'drat', 'cyl', 'wt', 'qsec', 'vs')], family=binomial(logit), IC='BIC', nvmax=4)
#' plotBestglm(b$BestModels, rc=3)
plotBestglm <- function(bglm, rc=2) {
image(as.matrix(t(bglm[,1:ncol(bglm)-1])), col=c(0:1), xaxt='n', yaxt='n')
axis(1, at=seq(0,1,length.out=ncol(bglm)-1), labels=colnames(bglm)[1:ncol(bglm)-1], las=2)
axis(2, at=seq(0,1,length.out=nrow(bglm)), labels=rev(round(bglm$Criterion,rc)), las=2)
}
#' @title Univariate glm regression
#' @description Runs a univariate logistic regression on each predictor variable of interest.
#' @param df data.frame with variables for analysis
#' @param yv character string, target variable
#' @param xv character vector, predictor variables to test univariately
#' @param file character string, filepath to write results out to. txt or csv file.
#' @param sortby character string, criteria to sort variables by. Default = 'aic'
#' @param xvlab data.frame, lookup table for variable names. First column is codename, second column is the pretty printed name.
#' @param test \code{TRUE} or \code{FALSE}. Includes Chi square test, or not.
#' @return data.frame of results
#' @seealso \code{\link{bestglm}}, \code{\link{leaps}}
#' @import Hmisc
#' @export
#' @examples
#'
#' require('Hmisc')
#'
#' ##setting up some data
#' longnames <- data.frame(long = c('Weight', 'Miles Per Gallon', 'Cylinder', 'D.R.A.T', 'Horsepower', 'A.M.'),
#' short = c('wt', 'mpg', 'cyl', 'drat', 'hp', 'am'), stringsAsFactors=F)
#'
#' glm.out <- uniglm(df=mtcars, yv='vs', xv=c('hp','drat','cyl','mpg','wt'), xvlab=longnames)
uniglm <- function(df, yv, xv, file=NULL, sortby='aic', xvlab=NULL, test=T){
mat <- data.frame(matrix(nrow=25*length(xv), ncol=12, dimnames=list(NULL, c('Predictor', 'Name', 'fac', 'ref', 'coef', 'oddsRatio', 'p', 'pstar', 'aic', 'c', 'Chisq_pvalue', 'Chisq_pstar'))))
i<-0
for(x in xv){
formt <- makeForm(yv, x)
reg <- glm(formt, data=df, family=binomial(logit))
for(j in 2:length(reg$coefficients)){
i<-i+1
try({
mat$Predictor[i] <- names(reg$coefficients[j])
## If we have nice clean labels to replace variable names in code
if(is.null(xvlab)==F) {mat$Name[i] <- as.character(xvlab[match(x, xvlab[,2]), 1])
} else {mat=mat[, setdiff(names(mat), 'Name')]}
## If we have a categorical variable with a reference
if(length(reg$coef)>2) {
regv <- gsub(x, '', names(reg$coef))
uxv <- unique(df[,x])
mat$ref[i] <- setdiff(uxv, regv)
mat$Predictor[i] <- x
mat$fac[i] <- gsub(x, '', names(reg$coef[j]))
}
if(test==T){
av <- anova(reg, test='Chisq')['Pr(>Chi)'][[1]][2]
mat$Chisq_pvalue[i] <- av
avp <- cut(mat$Chisq_pvalue[i], breaks=c(0,.001,.01,.05,.1, 1))
mat$Chisq_pstar[i] <- as.character(factor(x=avp, levels=levels(avp), labels=c('***', '**', '*', '.', '')))
} else {mat=mat[, setdiff(names(mat), c('Chisq_pvalue', 'Chisq_pstar'))]}
mat$coef[i] <- as.numeric(reg$coefficients[j])
mat$oddsRatio[i] <- as.numeric(exp(reg$coefficients[j]))
mat$p[i] <- as.numeric(summary(reg)$coefficients[j,4])
p <- cut(mat$p[i], breaks=c(0,.001,.01,.05,.1, 1))
mat$pstar[i] <- as.character(factor(x=p, levels=levels(p), labels=c('***', '**', '*', '.', '')))
mat$aic[i] <- reg$aic
mat$c[i] <- round(somers2(x=reg$fitted.values, y=reg$y)['C'],3)
})
}
}
mat<-mat[1:i,]
mat$ref[is.na(mat$ref)] <- ''
mat$fac[is.na(mat$fac)] <- ''
if(sum(mat$ref=='')==nrow(mat)) mat <- mat[, setdiff(names(mat), 'ref')]
if(sum(mat$fac=='')==nrow(mat)) mat <- mat[, setdiff(names(mat), 'fac')]
decreasingTF <- ifelse(sortby %in% c('c'), T, F)
mat <- mat[order(mat[,sortby], decreasing=decreasingTF),]
if(is.null(file)==F) write.csv(mat, file=file)
return(mat)
}
#' @title Analyze predictions of supervised model in quantiles
#' @description Analyzing results of supervised model using test data and quantiles. Note only
#' \code{model} or \code{testPred} are necessary, not both. \code{testPred} is advised.
#' @param model the model to get predictions from. Uses \code{predict} method. If \code{testPred}
#' is specified, we don't need this parameter, Probably safer to just use testPred, unless the model you have
#' has a predict method that will work as intended without any other arguments.
#' @param xtext data.frame of test data (predictor variables only)
#' @param ytest vector of test data (target variable)
#' @param n number of quantiles
#' @param roundText decimals to print
#' @param testPred vector of predictions from test data. Do not use if using \code{model} parameter.
#' @param fw numeric vector, of bins. Defaults to
#' @return data.frame of results
#' @seealso \code{\link{predSortPlot}}
#' @export
#' @examples
#' require('randomForest')
#' rf <- randomForest(x=mtcars[1:25,1:8], y=as.factor(mtcars[1:25, 'am']), ntree=5)
#' mtcarsTestPred <- predict(rf, mtcars[26:32, 1:8], type='prob')[,2]
#' pq2 <- predQuantile(xtest=mtcars[26:32, 1:8], ytest=mtcars[26:32, 'am'], n=3, roundText=4, testPred=mtcarsTestPred)
#' pq <- predQuantile(xtest=mtcars[26:32, 1:8], ytest=mtcars[26:32, 'am'], fw=seq(.1,1,length.out=10), roundText=4, testPred=mtcarsTestPred)
#'
#' barplot(pq$actual, names=pq$predRange, ylab='True Positives', las=2, cex.names=.7)
#' barplot(pq$hitRate, names=pq$predRange, ylab='True Positive Hit Rate', las=2, cex.names=.7)
#' barplot(pq$predMax-pq$predMin, pq$hitRate, names=pq$predRange, ylab='prediction', las=2, cex.names=.7)
predQuantile <- function(model=NULL, xtest=NULL, ytest, n=5, roundText=3, testPred=NULL, fw=NULL){
if(is.null(testPred)) {pred <- predict(model, xtest)
} else {pred <- testPred}
tf <- data.frame(pred=pred, hits=ytest)
tf <- tf[order(tf$pred),]
if(is.null(fw)) {tf$cat <- sort(rep(1:n, ceiling(nrow(tf)/n))[1:nrow(tf)])
} else {tf$cat <- cut(tf$pred, breaks=fw)}
aggSum <- aggregate(hits~cat, data=tf, sum)
aggLength <- aggregate(hits~cat, data=tf, length)
aggRange <- aggregate(pred~cat, data=tf, range)
aggRange <- cbind(aggRange[,1], data.frame(aggRange[,2]))
aggRangeText <- aggregate(pred~cat, data=tf, function(x) paste(round(range(x), roundText), collapse=' to '))
names(aggLength)[2] <- 'N'
names(aggRange) <- c('cat', 'predMin', 'predMax')
names(aggRangeText)[2] <- 'predRange'
agg <- merge(aggSum, aggLength, by='cat')
agg$hitRate <- agg$hits/agg$N
agg <- merge(agg, aggRange, by='cat')
agg <- merge(agg, aggRangeText, by='cat')
guessrate <- sum(ytest==1)/length(ytest)
agg$cumHitsPct <- rev(cumsum(rev(agg$hits)/sum(agg$hits)))
agg$cumHitRate <- rev(cumsum(rev(agg$hits))/cumsum(rev(agg$N)))
agg$cumNPct <- rev(cumsum(rev(agg$N)/sum(agg$N)))
agg$cumLift <- agg$cumHitRate/guessrate
##correcting for cutoffs with zero
if(!is.null(fw)) {
a<-paste(levels(tf$cat)[!levels(tf$cat) %in% tf$cat], collapse=', ')
print(paste('no scores in these ranges',a))
agg$cutoff <- fw[which(levels(tf$cat) %in% tf$cat)]
}
agg <- agg[nrow(agg):1,]
return(agg)
}
#' @title Devil's Horn
#' @description Visualize results of supervised model predictions on test data. Currently supports binary target variable.
#' Red dots represent observations where target variable = 1, black dots where target variable = 0.
#' @param pred vector of predictions for target variable on test data
#' @param ytest, vector of the target variable from test data. (0s and 1s)
#' @param jitterPlot \code{TRUE} or \code{FALSE}. Jitters points on plot when \code{TRUE}
#' @return plot
#' @seealso \code{\link{predQuantile}}
#' @export
#' @examples
#' ## Setting up some data and building a basic model on training data.
#' mylogit <- glm(vs~drat+hp+mpg, family=binomial('logit'), data=mtcars[1:25,])
#' mtcarsTestPred <- predict(mylogit, mtcars[26:32, ], type='response')
#' predSortPlot(pred=mtcarsTestPred, ytest=mtcars$vs[26:32])
predSortPlot <- function(pred, ytest, jitterPlot=NULL) {
tf <- data.frame(pred=pred, actual=ytest)
tf <- tf[order(tf$pred),]
#return(tf)
if(is.null(jitterPlot)==T) plot(tf$pred, col=ifelse(tf$actual==1, 'red', 'black'), cex=ifelse(tf$actual==1, 3, 2), ylab='Prediction')
if(is.null(jitterPlot)==F) plot(jitter(tf$pred, jitterPlot), col=ifelse(tf$actual==1, 'red', 'black'), pch=ifelse(tf$actual==1, 19, 21), ylab='Prediction')
}
#' @title Optimize weight on ensemble of 2 supervised models
#' @description This function creates a weighted average of predictions from two models
#' and evaluates F1, precision, recall, auc or c for each combination of the
#' models to determine the best weights for each.
#' @param pred1 numeric vector of probabilities, prediction from model 1
#' @param pred2 numeric vector of probabilities, prediction from model 2
#' @param actual vector of 1s and 0s. The target variable test data
#' @param steps number, high numbers compute a more exhaustive combination of model weights
#' @param cutoff Cutoff used to demarcate predictions into positive or negative class.
#' @param ytest, vector of the target variable from test data. (0s and 1s)
#' @param jitterPlot \code{TRUE} or \code{FALSE}. Jitters points on plot when \code{TRUE}
#' @return data.frame of results
#' @import Hmisc
#' @export
#' @examples
#' require('Hmisc')
#' fit_glm1 <- glm(am~cyl, data=mtcars, family=binomial(logit))
#' fit_glm2 <- glm(am~disp, data=mtcars, family=binomial(logit))
#' ow <- optimizeModelWeight(fit_glm1$fitted.values, fit_glm2$fitted.values, actual=fit_glm1$model$am)
#' plot(ow$weights, ow$precision, type='l', xlab='weight on model 1')
optimizeModelWeight <- function(pred1, pred2, actual, steps=50, cutoff=.5) {
s <- seq(0,1,length.out=steps)
df <- data.frame(weights=s, c=NA, precision=NA, recall=NA, f1=NA, f.5=NA, stringsAsFactors=F)
j <- 1
for(i in s){
mds <- (i)*pred1 + (1-i)*pred2
df$c[j] <- somers2(mds,actual)['C']
df$precision[j] <- sum(actual[mds>cutoff])/(length(actual[mds>cutoff]))
df$recall[j] <- sum(actual[mds>cutoff])/(sum(actual))
df$f1[j] <- (2*df$precision[j]*df$recall[j])/(df$precision[j]+df$recall[j])
df$f.5[j] <- ((1+.5^2)*df$precision[j]*df$recall[j])/(((.5^2)*df$precision[j])+df$recall[j])
j<-j+1
}
return(df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.