library(devtools)
library(roxygen2)
library(Hmisc)
library(psych)
library(tidyverse)
library(skimr)
library(purrr)
library(tidyr)
library(tidyverse)
library(gridExtra)
library(lubridate)
library(fastDummies)
library(data.table)
library(mltools)
library(MASS)
library(car)
library(patchwork)
library(ggthemes)
library(tinytex)
library(stats)
library(ggsci)
library(scales)
library(naniar)
#library(Amelia)
library(caret)
library(pROC)
library(methods)
EHTheme <- function(rectfill="cornsilk"){
x <- theme(axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 9), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkblue"))
return (x)
}
EHTheme_SlateGray2 <- function(rectfill="slategray2"){
x <- theme(axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 9), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkslategray"))
return (x)
}
EH_Theme_Histogram <- function(font_size=7, hist_nbins=30){
#Example of Usage:
# ggplot(dfErrors, aes(x=residuals)) +
# ggtitle("Distribution of Residuals for Decision Tree") +
# q$geom_histogram +
# q$theme_histogram +
# q$density_Histogram
theme_histogram <- theme(axis.title.x = element_text(size = font_size), axis.title.y = element_text(size = 9), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_text(size=8), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_blank(), panel.background = element_rect(fill = "slategray2", color="darkslategray"))
geom_histogram <- geom_histogram(bins=hist_nbins, fill="white", aes(y = stat(density)))
density_histogram <- geom_density(col = "red")
newList <- list("theme_histogram" = theme_histogram, "geom_histogram" = geom_histogram, "density_Histogram" = density_histogram)
return(newList)
}
EHModel_ChiSquare <- function(df, column1, column2, print="Nothing")
{
library(kableExtra)
t <- table(df[,column1], df[,column2])
test <- chisq.test(t)
if (print=="table")
{
kableExtra::kable(t)
}
if (print=="result")
{
test
}
if (print=="both")
{
kableExtra::kable(t)
test
}
xlist=list(t, test)
return (xlist)
}
#' @exportClass EH_SummarizeData
EH_SummarizeData <- setRefClass("EH_Summarize", fields = list(df = "data.frame",
font_size = "numeric", y = "character"), methods = list(
StandardPlots = function()
{
EHSummarize_StandardPlots(df, y)
},
Histogram = function()
{
EHSummarize_SingleColumn_Histograms(df)
}
))
EHSummarize_MissingValues <- function(df)
{
library(naniar)
#1. Missing Completely at Random (MCAR):
#2. Missing at Random (MAR):
#3. Missing Not at Random (MNAR)
list12 = list()
list12[[1]] <- gg_miss_var(df)
list12[[2]] <- vis_miss(df)
list12[[3]] <- gg_miss_upset(df)
return(list12)
}
EHPrepare_MissingValues_Imputation <- function(df, y="", impute = "mean", print_all = FALSE)
{
#1. Missing Completely at Random (MCAR):
#2. Missing at Random (MAR):
#3. Missing Not at Random (MNAR)
dfImputedMean <- df
for(i in colnames(df))
if(is.numeric(df[,i])){
meanv <- mean(df[,i], na.rm = TRUE)
dfImputedMean[,i][is.na(df[,i])] <- meanv
}
dfImputedMedian <- df
for(i in colnames(df))
if(is.numeric(df[,i])){
medianv <- median(df[,i], na.rm = TRUE)
dfImputedMedian[,i][is.na(df[,i])] <- medianv
}
if(y==""){
if(impute=="mean"){
return(dfImputedMean)
} else if (impute=="median"){
return(dfImputedMedian)
}
}
dfOmit <- na.omit(df)
fla <- substitute(n ~ ., list(n = as.name(y)))
m1 <- lm(fla, dfImputedMean)
step1 <- stepAIC(m1, trace=FALSE)
s1 <- summary(step1)$adj.r.squared
fla2 <- substitute(n ~ ., list(n = as.name(y)))
m2 <- lm(fla2, dfImputedMedian)
step2 <- stepAIC(m2, trace=FALSE)
s2 <- summary(step2)$adj.r.squared
fla3 <- substitute(n ~ ., list(n = as.name(y)))
m3 <- lm(fla3, dfOmit)
step3 <- stepAIC(m3, trace=FALSE)
s3 <- summary(step3)$adj.r.squared
l1 <- vector(mode = "list", length = 5)
names(l1) <- c("df", "type", "r2mean", "r2median", "r2omit")
l1$r2mean = s1
l1$r2median = s2
l1$r2omit = s3
if (impute == "mean") {
l1$type = "mean"
l1$df=dfImputedMean
}
else if (impute == "median") {
l1$type = "median"
l1$df=dfImputedMedian
}
else if (impute == "omit") {
l1$type = "omit"
l1$df=dfOmit
}
print(c("type:", l1$type))
print(c("r2mean:", round(l1$r2mean,4)))
print(c("r2median:", round(l1$r2median,4)))
print(c("r2omit", round(l1$r2omit,4)))
if (print_all) {
print(summary(step1))
print(summary(step2))
print(summary(step3))
}
return (l1$df)
}
EHExplore_Interactions_Scatterplots <- function(df, y, interaction, rectfill="lightskyblue") {
#If you get these Errors:
#Error: Unknown input: tbl_df' = you probably did not pass it a proper dataframe (probably a tibble instead)
library(ggsci)
df <- as.data.frame(df)
df <- select_if(df, is.numeric)
v <- as.vector(df[,interaction])
xtext1 = as.data.frame(aggregate(data.frame(count = v), list(value = v), length))
df[interaction][df[interaction] == "0"] <- paste0("0 (n=", xtext1$count[1], ")")
df[interaction][df[interaction] == "1"] <- paste0("1 (n=", xtext1$count[2], ")")
df[,interaction] <- as.factor(df[,interaction])
plot_list <- list()
for(i in 1:ncol(df)) {
p <- eval(substitute(ggplot(df, aes_string(df[ , i], y, color=interaction)) +
geom_point(alpha=.1) +
geom_smooth(method = "lm") +
xlab(colnames(df)[i]) +
theme(title = element_text(size=9), axis.title.x = element_text(size = 9), axis.title.y = element_text(size = 9), axis.text.x = element_text(size = 8), panel.grid.major.x = element_line(color="gray"), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkslategray")) +
scale_color_d3()+
scale_fill_d3()+
ggtitle(colnames(df)[i]), list(i=i)))
plot_list[[i]] <- p
}
return(plot_list)
}
EHSummarize_SingleColumn_BarCharts1 <- function(df, font_size=7, rectfill="slategray2")
{
dfBar2<-data.frame(lapply(df,factor))
plot_list2 <- list()
for(i in 1:ncol(df)) {
dfBar3 <- dfBar2 %>%
dplyr::group_by(dfBar2[,i]) %>%
dplyr::summarise(Count = n())
dfBar3 <- as.data.frame(dfBar3) |>
dplyr::rename(Selection = 1)
p <- eval(substitute(ggplot(dfBar3, aes(x=Selection, y=Count, fill=Selection)) +
geom_col() +
scale_color_brewer(type = "div", palette = 8)+
scale_fill_brewer(type = "div", palette = 8)+
theme(legend.position="none") +
ggtitle(colnames(df)[i]) +
theme(title = element_text(size =(font_size)), axis.title.x = element_blank(), axis.title.y = element_text(size = font_size), axis.text.x = element_text(size = font_size, angle=30, vjust=.5), axis.text.y = element_text(size = font_size), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color=rectfill), panel.background = element_rect(fill = rectfill, color="black", size = .3)) +
geom_text(aes(label = Count), size=(3), fontface="bold", color="black",
vjust = 1), list(i=i)))
plot_list2[[i]] <- p
}
return (plot_list2)
}
EHSummarize_SingleColumn_BarCharts2 <- function(df, font_size=7, decreasingOrder=TRUE, rectfill="slategray2")
{
dfBar2<-data.frame(lapply(df,factor))
plot_list2 <- list()
for(i in 1:ncol(df)) {
dfBar3 <- dfBar2 %>%
dplyr::group_by(dfBar2[,i]) %>%
dplyr::summarise(Count = n())
dfBar3 <- as.data.frame(dfBar3) |>
dplyr::rename(Selection = 1)
if (decreasingOrder){
dfBar3$Selection <- factor(dfBar3$Selection,
levels = dfBar3$Selection[order(dfBar3$Count)])
}
p <- eval(substitute(ggplot(dfBar3, aes(x=Selection, y=Count)) +
coord_flip() +
geom_col(color="black", size=.1, fill="ivory", width=.7) +
theme(legend.position="none") +
ggtitle(colnames(df)[i]) +
theme(title = element_text(size =(font_size), face="bold"), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(size = font_size), axis.text.y = element_text(size = font_size), axis.ticks.x = element_blank(), axis.ticks.y = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color=rectfill), panel.background = element_rect(fill = rectfill, color="black", size = .3 )) +
geom_text(aes(label = Count), size=(3), fontface="bold", color="red", hjust = 1.5), list(i=i)))
plot_list2[[i]] <- p
}
return (plot_list2)
}
EHSummarize_SingleColumn_BarCharts3 <- function(df, font_size=7, decreasingOrder=TRUE, rectfill="slategray2", title="")
{
dfBar2<-data.frame(lapply(df,factor))
plot_list2 <- list()
for(i in 1:ncol(df)) {
dfBar3 <- dfBar2 %>%
dplyr::group_by(dfBar2[,i]) %>%
dplyr::summarise(Count = n())
dfBar3 <- as.data.frame(dfBar3) |>
dplyr::rename(Selection = 1)
if (decreasingOrder){
dfBar3$Selection <- factor(dfBar3$Selection,
levels = dfBar3$Selection[order(dfBar3$Count)])
}
p <- eval(substitute(ggplot(dfBar3, aes(x=Selection, y=Count)) +
coord_flip() +
geom_col(color="black", size=.1, fill="ivory", width=.7) +
theme(legend.position="none") +
ylab(colnames(df)[i]) +
ggtitle(title) +
theme(title = element_text(size =(font_size), face="bold"), axis.title.x = element_text(size = font_size), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_text(size = font_size), axis.ticks.x = element_blank(), axis.ticks.y = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color=rectfill), panel.background = element_rect(fill = rectfill, color="black", size = .3 )) +
geom_text(aes(label = Count), size=(3), fontface="bold", color="red", hjust = 1.5), list(i=i)))
plot_list2[[i]] <- p
}
return (plot_list2)
}
EHSummarize_SingleColumn_Countplots <- function(df, font_size=7, rectfill="slategray2", title="", decreasingOrder=TRUE)
{
df <- df |> dplyr::select(is.character|is.factor)
df <- as.data.frame(unclass(df), stringsAsFactors = TRUE)
plot_list2 <- list()
for(i in 1:ncol(df)) {
ColName <- colnames(df)[i]
df2 <- df |>
group_by(df[,i]) |>
dplyr::summarize(Count=n()) |>
dplyr::rename_at(1, ~ColName)
if (decreasingOrder){
#order hasn't been worked out
}
p <- eval(substitute(ggplot(df2, aes_string(x=ColName, y="Count")) +
coord_flip() +
xlab(ColName) +
ggtitle(title) +
theme(axis.title.x = element_text(size = font_size), axis.title.y = element_text(size = 9), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill)) +
geom_bar(color="black", fill="white", stat="identity") +
geom_text(aes(label = Count), size=3, hjust = -.1), list(i=i)))
plot_list2[[i]] <- p
}
return (plot_list2)
}
EHSummarize_SingleColumn_Boxplots <- function(df, font_size=7)
{
df <- select_if(df, is.numeric)
plot_list2 <- list()
for(i in 1:ncol(df)) {
qp <- toString(head(sort(round(df[,i],2)),5))
qz <- toString(tail(sort(round(df[,i],2)),5))
qk <- str_c("L: ", qp, "\\\n", "H: ", qz)
qk <- gsub('\\\\','', qk)
p <- eval(substitute(ggplot(df, aes(df[,i])) +
coord_flip() +
xlab(colnames(df)[i]) +
ylab(qk) +
theme(axis.title.x = element_text(size = font_size), axis.title.y = element_text(size = 9), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = "slategray2", color="darkslategray")) +
geom_boxplot(), list(i=i)))
plot_list2[[i]] <- p
}
return (plot_list2)
}
#dfTrain <- read.csv("C:\\Users\\erico\\Documents\\R\\CUNY_621\\Baseball\\moneyball-training-data.csv", header=TRUE)
#dfTrain <- dfTrain %>%
# mutate(xq = ifelse(TEAM_PITCHING_H >1500, 1, 0))
#EHExplore_Correlations_Boxplots(dfTrain, "xq")
EHSummarize_SingleColumn_Histograms <- function(df, font_size = 7, hist_nbins = 20)
{
df <- select_if(df, is.numeric)
plot_list2 <- list()
for(i in 1:ncol(df)) {
qp <- toString(head(sort(round(df[,i],2)),5))
qz <- toString(tail(sort(round(df[,i],2)),5))
qk <- str_c("L: ", qp, "\\\n", "H: ", qz)
qk <- gsub('\\\\','', qk)
p <- eval(substitute(ggplot(df, aes(df[,i])) +
ylab(colnames(df)[i]) +
xlab(qk) +
theme(axis.title.x = element_text(size = font_size), axis.title.y = element_text(size = 9), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_text(size=8), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_blank(), panel.background = element_rect(fill = "slategray2", color="darkslategray")) +
geom_histogram(bins=hist_nbins, fill="white", aes(y = stat(density))) +
geom_density(col = "red"), list(i=i)))
plot_list2[[i]] <- p
}
return (plot_list2)
}
EHExplore_TwoContinuousColumns_Scatterplots <- function(df, y, flip=FALSE, rectfill="slategray2", pointfill="white")
{
plot_list <- list()
df=as.data.frame(df)
df <- select_if(df, is.numeric)
for(i in 1:ncol(df)) {
ct <- cor.test(df[,i], df[,y])
xText <- str_c("Correlation: ", round(ct$estimate,2), " p value: ", round(ct$p.value,2))
x1 = df[[i]]
y1 =y
if(flip)
{
x1=y
y1=df[[i]]
}
p <- ggplot(df, aes_string(x1, y1)) +
geom_point(fill="navy", color=pointfill) +
geom_smooth(method = "loess", color="red", fill="lightcoral") +
ylab(y) +
xlab(xText) +
theme(title = element_text(size=9), axis.title.x = element_text(size = 8), axis.title.y = element_text(size = 9), axis.text.x = element_text(size = 8), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkslategray")) +
ggtitle(colnames(df)[i])
p <- eval(substitute(p, list(i=i)))
plot_list[[i]] <- p
}
return(plot_list)
}
EHExplore_TwoContinuousColumns_CorrelationsAndPValues <- function(df, y)
#Also works for one continuous and one binary
# Error in cor.test.default(df[, i], df[, y]) : 'x' must be a numeric vector may occur if df is not recognized as a dataframe - even when it is derived from another dataframe
{
df <- as.data.frame(df)
df <- select_if(df, is.numeric)
dfResult <- data.frame()
for(i in 1:ncol(df)) {
ct <- cor.test(df[,i], df[,y])
#print (df[[i]])
rw <- c(colnames(df)[i], round(ct$estimate,2), round(ct$p.value,2))
dfResult <- rbind(dfResult, rw)
}
colnames(dfResult) = c("column", "correlation", "p")
dfResult <- dfResult |>
dplyr::arrange(p)
return(dfResult)
}
EHExplore_OneContinuousAndOneCategoricalColumn_Boxplots <- function(df, y, yCategorical=TRUE, rectfill="slategray2")
{
plot_list3 <- list()
#At this point, y has to be categorical, the only one and the last one
#Error: Error in parse(text = x, keep.source = FALSE) : <text>:1:12: unexpected symbol may result if the previous rule is violated
zz <- ncol(df) - 1
for(i in 1:zz) {
x1 = df[[i]]
y1 =y
p <- ggplot(df, aes_string(x1, y1, fill=y1)) +
#xlab(colnames(df)[i]) +
#ylab(xText) +
theme(title = element_text(size=9), axis.title.x = element_text(size = 9), axis.title.y = element_text(size = 9), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkslategray")) +
scale_color_d3()+
scale_fill_d3()+
theme(legend.position = "none") +
ggtitle(colnames(df)[i]) +
geom_boxplot()
plot_list3[[i]] <- eval(substitute(p, list(i=i)))
}
plot_list3[1]
return(plot_list3)
}
EHSummarize_StandardPlots <-function(data, y, return_list = FALSE, h_nbins = 20, print=TRUE, type="scatter")
{
#Error - ! Can't subset columns past the end. - may mean you passed a tibble, not a dataframe
list1 <- EHSummarize_SingleColumn_Boxplots(data)
list2 <- EHSummarize_SingleColumn_Histograms(data, hist_nbins = h_nbins)
if(type=="scatter"){
list3 <- EHExplore_TwoContinuousColumns_Scatterplots(data, y)
} else if (type=="box"){
list3 <- EHExplore_OneContinuousAndOneCategoricalColumn_Boxplots(data, y)
}
zz2 <- list()
for(i in 1:length(list1)) {
zz2[i*3-2] <- list1[i]
zz2[i*3-1] <- list2[i]
zz2[i*3] <- list3[i]
}
if (print) {
lenZ <- length(zz2)
quotient <- lenZ %/% 9
gap <- lenZ - quotient*9
gaprows <- gap/3
if (lenZ>=9) {
for(i in 1:quotient) {
start <- (i-1)*9 + 1
finish <- start + 8
grid.arrange(grobs=zz2[c(start:finish)], ncol=3)
}
}
if (gaprows>0) {
start <- quotient*9 + 1
finish <- start + gaprows*3 - 1
grid.arrange(grobs=zz2[c(start:finish)], ncol=3, nrow=gaprows)
}
}
if (return_list) {
return (zz2)
}
}
EHExplore_Multicollinearity <-function(df, printCorrs=FALSE, printHeatMap = TRUE, printHighest=FALSE, threshold=.85, title="Heatmap for Multicollinearity Analysis") {
#To print out only what you want, set the function to a variable, i.e. x <- EHExplore_Multicollinearity
#If you see: Error in if ((mult2[i, j] > threshold | mult2[i, j] < -1 * threshold) & : missing value where TRUE/FALSE needed it means there are missing values
dfCor <- as.data.frame(cor(df))
library(corrplot)
my_matrix <- df[]
cor_res <- cor(my_matrix, use = "na.or.complete")
if (printCorrs) {
print(dfCor)
}
if (printHeatMap) {
my_matrix <- df[]
cor_res <- cor(my_matrix, use = "na.or.complete")
z <- corrplot(cor_res, title = title, mar=c(0,0,2,0),
diag=FALSE, type = "upper", order = "original", tl.col = "black", tl.srt = 45, tl.cex = 0.55)
}
dfmm <- data.frame(col1=character(),
col2=character(),
correlation=double())
mult2 <- as.data.frame(dfCor)
for(i in 1:ncol(mult2)) { # for-loop over columns
for(j in 1:nrow(mult2)) {
if((mult2[i,j] >threshold | mult2[i,j] < -1*threshold) & mult2[i,j] != 1){
v <- c(colnames(mult2[i]), colnames(mult2[j]), mult2[i,j])
dfmm <- rbind(dfmm, data.frame(col1 =colnames(mult2[i]), col2 = colnames(mult2[j]), correlation=
mult2[i,j], stringsAsFactors = FALSE))
}
}
}
if (nrow(dfmm)>0){
nrow1 <- nrow(dfmm)/2
for (j in 1:nrow1){
cl1 <- dfmm[j,1]
cl2 <- dfmm[j,2]
dfmm <- subset(dfmm, dfmm[,1]!=cl2 | dfmm[,2]!=cl1)
}
} else {
dfmm[nrow(df) + 1,] = c("No Values", 0, 0)
}
if (printHighest){
print(dfmm)
}
rlist <- list(dfCor, dfmm)
return (rlist)
}
EHModel_Regression_StandardLM <- function(df, y, splitRatio=.8, xseed = 0, vif=TRUE, tests = TRUE, avplots = FALSE, xstepAIC=TRUE, returnLM=FALSE) {
library(caTools)
library(Metrics)
if(xseed>0) {
set.seed(xseed)
}
par(mfcol=c(2,2))
fla <- substitute(n ~ ., list(n = as.name(y)))
if(splitRatio==1) {
mod_4 <- lm(fla, df)
} else {
i <- createDataPartition(unlist(df[,y]), p=splitRatio, list=FALSE)
test_reg <- df[-i,]
train_reg <- df[i,]
mod_4 <- lm(fla, train_reg)
}
if(xstepAIC){
step3 <- stepAIC(mod_4, trace=FALSE)
} else {
step3 <- mod_4
}
step3_summary <- summary(step3)
print(step3_summary)
if (vif){
print("VIF Analysis")
vif_values <- car::vif(step3)
print(vif_values)
}
print(plot(step3))
if (tests) {
library(lmtest)
print(bptest(step3))
print(shapiro.test(step3$residuals))
}
if (avplots) {
avPlots(step3)
}
print(paste("AIC: ", AIC(step3)))
if (splitRatio==1){
list_data <- c(step3, 0, 0, 0)
if(!returnLM) {
return(list_data)
}else{
return (step3)
}
} else {
pred_linreg <- predict(step3,test_reg)
resids <- test_reg[,y]-pred_linreg
rmse1 <- rmse( test_reg[,y],pred_linreg)
print(paste("RMSE on evaluation set: ", rmse1))
}
list_data <- c(step3, rmse1, step3_summary$sigma, resids)
if(!returnLM) {
return(list_data)
}else{
return (step3)
}
}
EHModel_Regression_Robust <- function(df, y, splitRatio=.8, xseed = 0) {
library(caTools)
library(Metrics)
if(xseed>0) {
set.seed(xseed)
}
fla <- substitute(n ~ ., list(n = as.name(y)))
fm <- as.formula(fla)
i <- createDataPartition(unlist(df[y]), p=splitRatio, list=FALSE)
test_reg <- df[-i,]
train_reg <- df[i,]
m1 <- rlm(fm, train_reg)
m1_summary <- summary(m1)
print(m1_summary)
pred_linreg <- predict(m1,test_reg)
resids <- test_reg[,y]-pred_linreg
rmse1 <- rmse( test_reg[,y],pred_linreg)
print(paste("RMSE: ", rmse1))
list_data <- list(c(m1), rmse1, m1_summary$sigma, resids)
return(list_data)
}
EHExplore_TwoCategoricalColumns_Barcharts <- function(df, y)
{
plot_list4 <- list()
df <- df %>% select_if(function(x) is.character(x)|is.factor(x))
df[,y] <- as.factor(df[,y])
for(i in 1:ncol(df)) {
df[,i] <- as.factor(df[ ,i])
p <- ggplot(df, aes_string(x=df[ , i], fill=y)) +
geom_bar(position = "fill") +
ylab("Proportion") +
xlab(colnames(df)[i]) +
stat_count(geom="text", aes(label=stat(count)), position=position_fill(vjust=.5), color="black") +
scale_color_d3()+
scale_fill_d3()+
theme(title = element_text(size=9), axis.title.x = element_text(size = 8), axis.title.y = element_text(size = 9), axis.text.x = element_text(size = 8), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = "lightskyblue1", color="darkslategray")) +
ggtitle(paste("Number and Proportion of ", y, " by ", names(df)[i])) +
coord_flip()
p <- eval(substitute(p, list(i=i)))
plot_list4[[i]] <- p
}
return (plot_list4)
}
EHModel_Regression_Logistic <-function(df, y, splitRatio = .8, xseed = 0, returnLM=FALSE)
{
library(caTools)
library(ROCR)
if(xseed>0) {
set.seed(xseed)
}
if(splitRatio==1) {
fla <- substitute(n ~ ., list(n = as.name(y)))
logistic_model <- glm(fla,
data = df,
family = "binomial")
# Summary
print(summary(logistic_model))
listq = list()
listq[1] <- logistic_model
listq[2] <- 0
listq[3] <- 0
if(!returnLM) {
return(listq)
}else{
return (logistic_model)
}
}
i <- createDataPartition(unlist(df[y]), p=splitRatio, list=FALSE)
test_reg <- df[-i,]
train_reg <- df[i,]
fla <- substitute(n ~ ., list(n = as.name(y)))
logistic_model <- glm(fla,
data = train_reg,
family = "binomial")
# Summary
print(summary(logistic_model))
# Predict test data based on model
predict_reg <- predict(logistic_model,
test_reg, type = "response")
scored_class <- ifelse(predict_reg >0.5, 1, 0)
class <- test_reg[,y]
dfPred <- data.frame(class, scored_class)
dfPred$class <- as.factor(dfPred$class)
dfPred$scored_class <- as.factor(dfPred$scored_class)
q <-confusionMatrix(data = dfPred$scored_class, reference = dfPred$class)
print(q)
dfPred_raw <- data.frame(class, predict_reg)
roc(class ~ predict_reg, dfPred_raw)
roc1 <- roc(dfPred_raw$class,
dfPred_raw$predict_reg, plot=TRUE)
xauc <- roc1$auc
print(roc1)
listq = list()
listq[1] <- logistic_model
listq[2] <- q$overall['Accuracy']
listq[3] <- logistic_model$aic
listq[4] <- xauc
if(!returnLM) {
return(listq)
}else{
return (logistic_model)
}
}
EHPrepare_ScaleAllButTarget <-function(df, y)
{
df1 <- df %>%
dplyr::select(-{{y}})
df1 <- data.frame(scale(df1))
df2 <- df %>%
dplyr::select({{y}})
df3 <- cbind(df1,df2)
return(df3)
}
EHModel_Regression_Logistic_Iterations <- function(df, y, numOfIterations=100)
{
acc = list()
AIC = list()
AUC = list()
for (i in 1:numOfIterations)
{
q <- EHModel_Regression_Logistic(df, y)
acc[i]=q[2]
AIC[i]=q[3]
AUC[i] = q[4]
}
accv <- unlist(acc)
aveq <- mean(accv)
aicv <- unlist(AIC)
aicq <- mean(aicv)
aucv <- unlist(AUC)
aucq <- mean(aucv)
print(paste("Accuracy: ", aveq))
print(paste("AIC: ", aicq))
print(paste("AUC: ", aucq))
}
EHModel_Regression_Standard_Iterations <- function(df, y, numOfIterations=100)
{
rmse2 = list()
rse = list()
for (i in 1:numOfIterations)
{
q <- EHModel_Regression_StandardLM(df, y, xstepAIC=FALSE)
rmse2[i]=q[2]
rse[i]=q[3]
}
rsme2q <- unlist(rmse2)
rsme2m <- mean(rsme2q)
rsev <- unlist(rse)
rsem <- mean(rsev)
print(paste("Average RSME: ", rsme2m))
print(paste("Average RSE: ", rsem))
}
EHModel_Regression_Robust_Iterations <- function(df, y, numOfIterations=100)
{
rmse2 = list()
rse = list()
for (i in 1:numOfIterations)
{
q <- EHModel_Regression_Robust(df, y)
rmse2[i]=q[2]
rse[i]=q[3]
}
rsme2q <- unlist(rmse2)
rsme2m <- mean(rsme2q)
rsev <- unlist(rse)
rsem <- mean(rsev)
print(paste("Average RSME: ", rsme2m))
print(paste("Average RSE: ", rsem))
}
EHPrepare_CreateDummies <- function(df, target, include=list(), exclude=list(), removeColumn=TRUE)
{
#Error in top_vals$vals : $ operator is invalid for atomic vectors - this
#may simply mean one of your categorical variables only has one value
targ123 <- target
df3 <- df %>%
dplyr::select(-matches(targ123))
fact <- df3 %>%
dplyr::select(is.factor|is.character)
cols <- colnames(fact)
if(length(include>0)){
cols <- include
}
if(length(exclude>0)){
cols <- cols[! cols %in% exclude]
}
df4 <- fastDummies::dummy_cols(df, select_columns=cols, remove_selected_columns = removeColumn, remove_most_frequent_dummy = removeColumn, ignore_na=FALSE)
colnames(df4) <- make.names(colnames(df4))
return(df4)
}
EHPrepare_RestrictDataFrameColumnsToThoseInCommon <- function(df1, df2, exclude=list())
{
library(janitor)
cmp <- compare_df_cols(df1, df2)
cmp_No1 <- cmp %>%
dplyr::filter(is.na(df1))%>%
dplyr::filter(!column_name %in% exclude)
cmp_No1V <- cmp_No1$column_name
df2R <- df2 %>%
dplyr::select(!any_of(cmp_No1V))
cmp_No2 <- cmp %>%
dplyr::filter(is.na(df2)) %>%
dplyr::filter(!column_name %in% exclude)
cmp_No2V <- cmp_No2$column_name
df1R <- df1 %>%
dplyr::select(!any_of(cmp_No2V))
rlist <- list(df1R, df2R)
return(rlist)
}
EHPrepare_BoxCox <- function(df, col, print=TRUE, newcol=FALSE)
{
print("DO NOT USE!")
#For some reason you have to generate the formula in a line before the call. I can't generate it in the method because of environment reasons.
#So that means putting, e.g. "xformula = terget ~ 1" as a line before the call. Target is whatever our target is, the rest stays the same
#This doesn't fix it: https://stackoverflow.com/questions/74527907/r-how-do-i-pass-a-formula-to-the-linear-model-constructor-and-the-resulting-lin
library(MASS)
#For some reason boxcox fails if you use df as a parameter - so that's why it's df2
df2 <- as.data.frame(df)
hist(df2[,col], main=paste(col, "- Before"))
fla <- substitute(n ~ 1, list(n = as.name(col)))
#The problem is , that line stays in there so if you forget to change it you keep running the algorithm on the old variable even though you have sepcified a new one.
if(print) {
hist(df2[,col], main=paste(col, "- Before"))
}
#a<- qq #breaks the method - so it isn't used.
b <- boxcox(lm(xformula, df2))
lambda <- b$x[which.max(b$y)]
df2[, col] <- (df2[,col] ^ lambda - 1) / lambda
hist(df2[,col], main=paste(col, "- After"))
if(print) {
hist(df2[,col], main=paste(col, "- After, lambda =", lambda))
}
return(df2)
}
EHModel_DecisionTree <- function(df, target, seed=042760, levels=31, categorical=TRUE, printFancyTree=TRUE, printConfusionMatrix = TRUE, printDT=TRUE)
{
#"Need to be the same factors" - Make sure to designate categorical=false if the targ123 is continuous
# There are two trees - the tree from caret (train(formula, ...)) is what the rmse is based on.
# The other tree is not - it is also the one influenced by the number of levels.This is the 'fancy tree.'
# I believe the fancy tree is also the one with all the stats.
targ123 = target
df4 <- df
if (categorical) {
df4[, targ123] <- as.factor(df4[, targ123])
}
fla <- substitute(n ~ ., list(n = as.name(targ123)))
set.seed(seed)
i <- createDataPartition(unlist(df4[,targ123]), p=0.8, list=FALSE)
dfEval <- df4[-i,]
dfTrain <- df4[i,]
count(dfTrain[targ123])
tc <- trainControl(method="cv", number=10)
metric <- "Accuracy"
library(rpart)
levels2 = levels-1
output.tree <- rpart(fla, data = dfTrain, control = rpart.control(maxdepth = levels2))
library(rpart.plot)
library(RColorBrewer)
library(rattle)
if(printFancyTree){
fancyRpartPlot(output.tree)
}
Formula = reformulate(".",response=targ123)
dt <- train(Formula, data=dfTrain, method="rpart")
if (printDT) {
library(rpart.plot)
rpart.plot(dt$finalModel)
}
predictions <- predict(dt, dfEval)
dfPred <- as.data.frame(predictions)
if (categorical) {
x <- factor(dfEval[, targ123])
y <- confusionMatrix(predictions, x)
if(printConfusionMatrix) {
print(y)
}
} else {
#load Metrics package
library(Metrics)
rmseval <- rmse(dfEval[,targ123], dfPred$predictions)
print(paste('Decision tree - RMSE on evaluation set: ', rmseval))
}
x <- as.data.frame(cbind(dfEval[,targ123], dfPred))
x1 <- x %>%
dplyr::rename("observeds" = 1) %>%
mutate(observeds = as.double(observeds)) %>%
mutate(predictions = as.double(predictions)) %>%
mutate(residuals = observeds - predictions)
newList <- list("dt" = dt, "errors" = x1)
return(newList)
}
EHModel_RandomForest <- function(df4, target, seed=042760, categorical=TRUE, printRF = TRUE, printVarimp=TRUE, printPlot=TRUE, printConfusionMatrix=TRUE)
{
#"Need to be the same factors" - Make sure to designate categorical=false if the targ123 is continuous
#'Error in confusionMatrix.default(predictions, x) : the data cannot have more levels than the reference - This occured when the target was continuous
targ123 <- target
if (categorical) {
df4[, targ123] <- as.factor(df4[, targ123])
}
set.seed(seed)
i <- createDataPartition(unlist(df4[,targ123]), p=0.8, list=FALSE)
dfEval <- df4[-i,]
dfTrain <- df4[i,]
count(dfTrain[targ123])
tc <- trainControl(method="cv", number=10)
metric <- "Accuracy"
Formula = reformulate(".",response=targ123)
rf <- train(Formula, data=dfTrain, method="rf", trControl = tc)
if (printRF){
print(rf)
}
if (printPlot){
print(plot(rf))
}
if (printVarimp){
print(varImp(rf))
}
predictions <- predict(rf, dfEval)
dfPred <- as.data.frame(predictions)
if (categorical) {
x <- factor(dfEval[, targ123])
y <- confusionMatrix(predictions, x)
if (printConfusionMatrix){
print(y)
}
} else {
library(Metrics)
rmseval <- rmse(dfEval[,targ123], dfPred$predictions)
print(paste('Random Forest - RMSE on evaluation set: ', rmseval))
}
print(paste("Parameters: mtry = ", rf$finalModel$mtry, ", ntree = ", rf$finalModel$ntree, ", nrnodes = ", rf$finalModel$forest$nrnodes))
x <- as.data.frame(cbind(dfEval[,targ123], dfPred))
x1 <- x %>%
dplyr::rename("observeds" = 1) %>%
dplyr::mutate(observeds = as.double(observeds)) %>%
dplyr::mutate(predictions = as.double(predictions)) %>%
dplyr::mutate(residuals = observeds - predictions)
newList <- list("rf" = rf, "errors" = x1)
return(newList)
}
EHModel_SVM_ToReplace <- function(df4, target, method = "linear", seed=042760, printSVM = TRUE, printPlot=FALSE, printConfusionMatrix =TRUE, cValue=0, sigmaValue=0)
{
#PROBLEM- formula (y ~ ) and a df takes 100 times longer than an x df and a y df!! Need to change.
#Scaling is done as part of pre-processing in train, so need not be done by hand.
#For linear, c is tuned by the grid: expand.grid(C = seq(0.01, 2, length = 20). For radial and poly, sigma and c are optimized automatically, UNLESS YOU SPECIFY BOTH (WOULD BE BETTER IN A LIST)
targ123 <- target
df4[, targ123] <- as.factor(df4[, targ123])
set.seed(seed)
i <- createDataPartition(unlist(df4[,targ123]), p=0.8, list=FALSE)
dfEval <- df4[-i,]
dfTrain <- df4[i,]
count(dfTrain[targ123])
tc <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "Accuracy"
library("stringi")
method1 <- stri_trans_totitle(method)
method2 <- paste0("svm", method1)
Formula = reformulate(".",response=targ123)
if (method1 == "Linear") {
svm <- train(Formula, data=dfTrain, method=method2, trControl = tc, preProcess = c("center","scale"), tuneGrid = expand.grid(C = seq(0.01, 2, length = 20)))
} else if (method1=="Radial"|method1=="Poly") {
if (cValue!=0 && sigmaValue!=0) {
svm <- train(Formula, data=dfTrain, method=method2, trControl = tc, preProcess = c("center","scale"), tuneGrid = expand.grid(C = cValue, sigma=sigmaValue))
} else {
svm <- train(Formula, data=dfTrain, method=method2, trControl=tc, preProcess = c("center","scale"))
}
} else {
print("Unkown kernel. The choices are linear, radial or poly.")
retun()
}
if (printSVM){
print(svm)
}
if (printPlot){
print(plot(svm))
}
predictions <- predict(svm, dfEval)
dfPred <- as.data.frame(predictions)
x <- factor(dfEval[, targ123])
y <- confusionMatrix(predictions, x)
if (printConfusionMatrix){
print(y)
}
#print(paste("Parameters: mtry = ", rf$finalModel$mtry, ", ntree = ", rf$finalModel$ntree, ", nrnodes = ", rf$finalModel$forest$nrnodes))
x <- as.data.frame(cbind(dfEval[,targ123], dfPred))
x1 <- x %>%
dplyr::rename("observeds" = 1) %>%
mutate(observeds = as.double(observeds)) %>%
mutate(predictions = as.double(predictions)) %>%
mutate(residuals = observeds - predictions)
newList <- list("svm" = svm, "errors" = x1)
return(newList)
}
EHModel_SVM <- function(df4, target, method = "linear", seed=042760, printSVM = TRUE, printPlot=FALSE, printConfusionMatrix =TRUE, cValue=0, sigmaValue=0)
{
Print ("Use the 'TOReplace' one - this doesn't work right - it can't make predictions")
#PROBLEM- formula (y ~ ) and a df takes 100 times longer than an x df and a y df!! Need to change.
#Scaling is done as part of pre-processing in train, so need not be done by hand.
#For linear, c is tuned by the grid: expand.grid(C = seq(0.01, 2, length = 20). For radial and poly, sigma and c are optimized automatically, UNLESS YOU SPECIFY BOTH (WOULD BE BETTER IN A LIST)
#"Error: `data` and `reference` should be factors with the same levels." may mean that the model cannot generate predicitons.
targ123 <- target
df4[, targ123] <- as.factor(df4[, targ123])
set.seed(seed)
i <- createDataPartition(unlist(df4[,targ123]), p=0.8, list=FALSE)
dfEval <- df4[-i,]
dfTrain <- df4[i,]
count(dfTrain[targ123])
tc <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "Accuracy"
library("stringi")
method1 <- stri_trans_totitle(method)
method2 <- paste0("svm", method1)
xdf <- dfTrain %>%
dplyr::select(-targ123)
ydf <- as.numeric(dfTrain[,targ123])
if (method1 == "Linear") {
svm <- train(xdf,ydf, method=method2, trControl = tc, preProcess = c("center","scale"), tuneGrid = expand.grid(C = seq(0.01, 2, length = 20)))
} else if (method1=="Radial"|method1=="Poly") {
if (cValue!=0 && sigmaValue!=0) {
svm <- train(xdf,ydf, method=method2, trControl = tc, preProcess = c("center","scale"), tuneGrid = expand.grid(C = cValue, sigma=sigmaValue))
} else {
svm <- train(xdf,ydf, method=method2, trControl=tc, preProcess = c("center","scale"))
}
} else {
print("Unkown kernel. The choices are linear, radial or poly.")
retun()
}
if (printSVM){
print(svm)
}
if (printPlot){
print(plot(svm))
}
predictions <- predict(svm, dfEval)
dfPred <- as.data.frame(predictions)
x <- factor(dfEval[, targ123])
y <- confusionMatrix(predictions, x)
if (printConfusionMatrix){
print(y)
}
#print(paste("Parameters: mtry = ", rf$finalModel$mtry, ", ntree = ", rf$finalModel$ntree, ", nrnodes = ", rf$finalModel$forest$nrnodes))
x <- as.data.frame(cbind(dfEval[,targ123], dfPred))
x1 <- x %>%
dplyr::rename("observeds" = 1) %>%
mutate(observeds = as.double(observeds)) %>%
mutate(predictions = as.double(predictions)) %>%
mutate(residuals = observeds - predictions)
newList <- list("svm" = svm, "errors" = x1)
return(newList)
}
EHCalculate_AUC_ForBinaryClasses <- function(dfPredictions, printPlot=TRUE, printConfusionMatrix=FALSE)
{
#Observed come first, then Predictions!
library(caTools)
library(ROCR)
dfPred <- dfPredictions %>%
dplyr::rename("obs1"=1, "pred1"=2) %>%
dplyr::select(obs1, pred1)
dfPred1 <- dfPred
dfPred1$obs1a <- as.factor(dfPred1$obs1)
dfPred1$pred1a <- as.factor(dfPred1$pred1)
q <-confusionMatrix(data = dfPred1$pred1a, reference = dfPred1$obs1a)
if (printConfusionMatrix){
print(q)
}
roc1 <- roc(dfPred$obs1,
dfPred$pred1, plot=printPlot)
xauc <- roc1$auc
newList <- list("AUC" = xauc, "ConfusionMatrix" = q)
return(newList)
}
EHModel_Predict <- function(model, dftestData, testData_IDColumn, predictionsColumnName ="Predictions", threshold=0, writeFile="")
{
predictions <- predict(model,newdata=dftestData)
predictions <- data.frame(as.vector(predictions))
predictions[, testData_IDColumn] <- dftestData[, testData_IDColumn]
predictions[,c(1,2)] <- predictions[,c(2,1)]
colnames(predictions) <- c(testData_IDColumn, predictionsColumnName)
if (threshold>0){
predictions[, predictionsColumnName] <- ifelse(predictions[, predictionsColumnName]>threshold,1,0)
}
if (writeFile!="") {
write_csv(predictions, writeFile)
}
return(predictions)
}
EHPrepare_RemoveRecordsByRowNumber <- function(df, num)
{
#num can be a single number or a c() of numbers
df <- df[-c(num), ]
return (df)
}
EHPrepare_RemoveColumnsWithAllNA <- function(df)
{
#num can be a single number or a c() of numbers
df <- df |>
dplyr::select(where(~!all(is.na(.x))))
return (df)
}
EHSummarize_OverallColumnDescription <- function(df, summary=TRUE, str=FALSE, glimpse=TRUE, describe=FALSE)
{
library(Hmisc)
if(summary)
{
print(summary(df))
}
if(str)
{
print(str(df))
}
if(glimpse)
{
print(glimpse(df))
}
if(describe)
{
print(describe(df))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.