R/EHDataR.R

Defines functions EHSummarize_OverallColumnDescription EHPrepare_RemoveColumnsWithAllNA EHPrepare_RemoveRecordsByRowNumber EHModel_Predict EHCalculate_AUC_ForBinaryClasses EHModel_SVM EHModel_SVM_ToReplace EHModel_RandomForest EHModel_DecisionTree EHPrepare_BoxCox EHPrepare_RestrictDataFrameColumnsToThoseInCommon EHPrepare_CreateDummies EHModel_Regression_Robust_Iterations EHModel_Regression_Standard_Iterations EHModel_Regression_Logistic_Iterations EHPrepare_ScaleAllButTarget EHModel_Regression_Logistic EHExplore_TwoCategoricalColumns_Barcharts EHModel_Regression_Robust EHModel_Regression_StandardLM EHExplore_Multicollinearity EHSummarize_StandardPlots EHExplore_OneContinuousAndOneCategoricalColumn_Boxplots EHExplore_TwoContinuousColumns_CorrelationsAndPValues EHExplore_TwoContinuousColumns_Scatterplots EHSummarize_SingleColumn_Histograms EHSummarize_SingleColumn_Boxplots EHSummarize_SingleColumn_Countplots EHSummarize_SingleColumn_BarCharts3 EHSummarize_SingleColumn_BarCharts2 EHSummarize_SingleColumn_BarCharts1 EHExplore_Interactions_Scatterplots EHPrepare_MissingValues_Imputation EHSummarize_MissingValues EHModel_ChiSquare EH_Theme_Histogram EHTheme_SlateGray2 EHTheme

library(devtools)
library(roxygen2)
library(Hmisc)
library(psych)
library(tidyverse)
library(skimr)
library(purrr)
library(tidyr)
library(tidyverse)
library(gridExtra)
library(lubridate)
library(fastDummies)
library(data.table)
library(mltools)
library(MASS)
library(car)
library(patchwork)
library(ggthemes)
library(tinytex)
library(stats)
library(ggsci)
library(scales)
library(naniar)
#library(Amelia)
library(caret)
library(pROC)
library(methods) 

EHTheme <- function(rectfill="cornsilk"){
  
  x <- theme(axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 9), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkblue"))
  
  return (x)
  
}

EHTheme_SlateGray2 <- function(rectfill="slategray2"){
  
  x <- theme(axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 9), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkslategray"))
  
  return (x)
  
}

EH_Theme_Histogram <- function(font_size=7, hist_nbins=30){
  
  #Example of Usage:
  # ggplot(dfErrors, aes(x=residuals)) +
  #  ggtitle("Distribution of Residuals for Decision Tree") +
  #  q$geom_histogram +
  #  q$theme_histogram +
  #  q$density_Histogram
  
  theme_histogram <- theme(axis.title.x = element_text(size = font_size), axis.title.y = element_text(size = 9), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_text(size=8),  panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_blank(), panel.background = element_rect(fill = "slategray2", color="darkslategray"))
    
  geom_histogram <- geom_histogram(bins=hist_nbins, fill="white", aes(y = stat(density))) 
    
  density_histogram <- geom_density(col = "red")

  newList <- list("theme_histogram" = theme_histogram, "geom_histogram" = geom_histogram, "density_Histogram" = density_histogram)
  return(newList)
  
}

EHModel_ChiSquare <- function(df, column1, column2, print="Nothing")
{
  library(kableExtra)
  t <- table(df[,column1], df[,column2])
  test <- chisq.test(t)
  
  if (print=="table")
  {
    kableExtra::kable(t)
  }
  
  if (print=="result")
  {
    test
  }
  
  if (print=="both")
  {
    kableExtra::kable(t)
    test
  }
  
  xlist=list(t, test)
  return (xlist)
  
}


#' @exportClass EH_SummarizeData
EH_SummarizeData <- setRefClass("EH_Summarize", fields = list(df = "data.frame", 
                                              font_size = "numeric", y = "character"), methods = list( 
                                                StandardPlots = function() 
                                                { 
                                                  EHSummarize_StandardPlots(df, y)
                                                }, 
                                                  Histogram = function() 
                                                { 
                                                  EHSummarize_SingleColumn_Histograms(df)
                                                } 
                                              )) 


EHSummarize_MissingValues <- function(df)
{

  library(naniar)
  
  #1. Missing Completely at Random (MCAR):
  #2. Missing at Random (MAR):
  #3. Missing Not at Random (MNAR)
  
  list12 = list()
  
  list12[[1]] <- gg_miss_var(df)
  list12[[2]] <- vis_miss(df)
  list12[[3]] <- gg_miss_upset(df)
  
  return(list12)

  
}


EHPrepare_MissingValues_Imputation <- function(df, y="", impute = "mean", print_all = FALSE)
{
  
  #1. Missing Completely at Random (MCAR):
  #2. Missing at Random (MAR):
  #3. Missing Not at Random (MNAR)
    
    dfImputedMean <- df

    for(i in colnames(df))
      if(is.numeric(df[,i])){
        meanv <- mean(df[,i], na.rm = TRUE)  
        dfImputedMean[,i][is.na(df[,i])] <- meanv
      }
    

  
    dfImputedMedian <- df
    
    for(i in colnames(df))
      if(is.numeric(df[,i])){
        medianv <- median(df[,i], na.rm = TRUE)  
        dfImputedMedian[,i][is.na(df[,i])] <- medianv
      }
    
    if(y==""){
      if(impute=="mean"){
        return(dfImputedMean)
      } else if (impute=="median"){
        return(dfImputedMedian)
      }
    }
    
  dfOmit <- na.omit(df)
  
  fla <- substitute(n ~ ., list(n = as.name(y)))
  m1 <- lm(fla, dfImputedMean)
  step1 <- stepAIC(m1, trace=FALSE)
  s1 <- summary(step1)$adj.r.squared
  
  fla2 <- substitute(n ~ ., list(n = as.name(y)))
  m2 <- lm(fla2, dfImputedMedian)
  step2 <- stepAIC(m2, trace=FALSE)
  s2 <- summary(step2)$adj.r.squared
  
  fla3 <- substitute(n ~ ., list(n = as.name(y)))
  m3 <- lm(fla3, dfOmit)
  step3 <- stepAIC(m3, trace=FALSE)
  s3 <- summary(step3)$adj.r.squared
  
  l1 <- vector(mode = "list", length = 5)
  names(l1) <- c("df", "type", "r2mean", "r2median", "r2omit")
  
  l1$r2mean = s1
  l1$r2median = s2
  l1$r2omit = s3

  
  if (impute == "mean") {
    l1$type = "mean"
    l1$df=dfImputedMean
  }
  else if (impute == "median") {
    l1$type = "median"
    l1$df=dfImputedMedian
  }
  else if (impute == "omit") {
    l1$type = "omit"
    l1$df=dfOmit
  }
  
  
  print(c("type:", l1$type))
  print(c("r2mean:", round(l1$r2mean,4)))
  print(c("r2median:", round(l1$r2median,4)))
  print(c("r2omit", round(l1$r2omit,4)))
  
    if (print_all) {
      print(summary(step1))
      print(summary(step2))
      print(summary(step3))
    }
  
    return (l1$df)
}

EHExplore_Interactions_Scatterplots <- function(df, y, interaction, rectfill="lightskyblue") {
  
  #If you get these Errors:
  #Error: Unknown input: tbl_df' = you probably did not pass it a proper dataframe (probably a tibble instead)
  
  library(ggsci)
  
  df <- as.data.frame(df)
  
  df <- select_if(df, is.numeric)
  
  v <- as.vector(df[,interaction])

  xtext1 = as.data.frame(aggregate(data.frame(count = v), list(value = v), length))
  df[interaction][df[interaction] == "0"] <- paste0("0 (n=", xtext1$count[1], ")")
  df[interaction][df[interaction] == "1"] <- paste0("1 (n=", xtext1$count[2], ")")
  
  
  df[,interaction] <- as.factor(df[,interaction])
  
  plot_list <- list()
  
  for(i in 1:ncol(df)) {     
    
    p <- eval(substitute(ggplot(df, aes_string(df[ , i], y, color=interaction)) +
                           geom_point(alpha=.1) +
                           geom_smooth(method = "lm") +
                           xlab(colnames(df)[i]) +
                           theme(title = element_text(size=9), axis.title.x = element_text(size = 9), axis.title.y = element_text(size = 9), axis.text.x = element_text(size = 8), panel.grid.major.x = element_line(color="gray"), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkslategray")) +
                           scale_color_d3()+
                           scale_fill_d3()+
                           ggtitle(colnames(df)[i]), list(i=i)))
    plot_list[[i]] <- p 
    
  }
  return(plot_list)
}

EHSummarize_SingleColumn_BarCharts1 <- function(df, font_size=7, rectfill="slategray2")
{  
  
  dfBar2<-data.frame(lapply(df,factor))
  
  plot_list2 <- list()
  
  for(i in 1:ncol(df)) {   
    
    dfBar3 <- dfBar2 %>% 
      dplyr::group_by(dfBar2[,i]) %>% 
      dplyr::summarise(Count = n())
    
    dfBar3 <- as.data.frame(dfBar3) |>
      dplyr::rename(Selection = 1)
    
    p <- eval(substitute(ggplot(dfBar3, aes(x=Selection, y=Count, fill=Selection)) +
                           geom_col() +
                           scale_color_brewer(type = "div", palette = 8)+
                           scale_fill_brewer(type = "div", palette = 8)+  
                           theme(legend.position="none") +
                           ggtitle(colnames(df)[i]) +
                           theme(title = element_text(size =(font_size)), axis.title.x = element_blank(), axis.title.y = element_text(size = font_size), axis.text.x = element_text(size = font_size, angle=30, vjust=.5), axis.text.y = element_text(size = font_size), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color=rectfill), panel.background = element_rect(fill = rectfill, color="black", size = .3)) +
                           geom_text(aes(label = Count), size=(3), fontface="bold", color="black",
                                     vjust = 1), list(i=i)))
    
    plot_list2[[i]] <- p 
    
    
  }
  return (plot_list2)
}


EHSummarize_SingleColumn_BarCharts2 <- function(df, font_size=7, decreasingOrder=TRUE, rectfill="slategray2")
{  
  
  dfBar2<-data.frame(lapply(df,factor))
  
  plot_list2 <- list()
  
  for(i in 1:ncol(df)) {   
    
    dfBar3 <- dfBar2 %>% 
      dplyr::group_by(dfBar2[,i]) %>% 
      dplyr::summarise(Count = n())
    
    dfBar3 <- as.data.frame(dfBar3) |>
      dplyr::rename(Selection = 1)
    
    if (decreasingOrder){
      dfBar3$Selection <- factor(dfBar3$Selection,                                  
                                 levels = dfBar3$Selection[order(dfBar3$Count)])
    }
    
    p <- eval(substitute(ggplot(dfBar3, aes(x=Selection, y=Count)) +
                           coord_flip() +
                           geom_col(color="black", size=.1, fill="ivory", width=.7) +
                           theme(legend.position="none") +
                           ggtitle(colnames(df)[i]) +
                           theme(title = element_text(size =(font_size), face="bold"), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(size = font_size), axis.text.y = element_text(size = font_size), axis.ticks.x = element_blank(), axis.ticks.y = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(),  panel.grid.major.y=element_line(color=rectfill), panel.background = element_rect(fill = rectfill, color="black", size = .3 )) +
                           geom_text(aes(label = Count), size=(3), fontface="bold", color="red", hjust = 1.5), list(i=i)))
    
    plot_list2[[i]] <- p 
    
    
  }
  return (plot_list2)
}

EHSummarize_SingleColumn_BarCharts3 <- function(df, font_size=7, decreasingOrder=TRUE, rectfill="slategray2", title="")
{  
  
  dfBar2<-data.frame(lapply(df,factor))
  
  plot_list2 <- list()
  
  for(i in 1:ncol(df)) {   
    
    dfBar3 <- dfBar2 %>% 
      dplyr::group_by(dfBar2[,i]) %>% 
      dplyr::summarise(Count = n())
    
    dfBar3 <- as.data.frame(dfBar3) |>
      dplyr::rename(Selection = 1)
    
    if (decreasingOrder){
      dfBar3$Selection <- factor(dfBar3$Selection,                                  
                                 levels = dfBar3$Selection[order(dfBar3$Count)])
    }
    
    p <- eval(substitute(ggplot(dfBar3, aes(x=Selection, y=Count)) +
                           coord_flip() +
                           geom_col(color="black", size=.1, fill="ivory", width=.7) +
                           theme(legend.position="none") +
                           ylab(colnames(df)[i]) + 
                           ggtitle(title) +
                           theme(title = element_text(size =(font_size), face="bold"), axis.title.x = element_text(size = font_size), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_text(size = font_size), axis.ticks.x = element_blank(), axis.ticks.y = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(),  panel.grid.major.y=element_line(color=rectfill), panel.background = element_rect(fill = rectfill, color="black", size = .3 )) +
                           geom_text(aes(label = Count), size=(3), fontface="bold", color="red", hjust = 1.5), list(i=i)))
    
    plot_list2[[i]] <- p 
    
    
  }
  return (plot_list2)
}

EHSummarize_SingleColumn_Countplots <- function(df, font_size=7, rectfill="slategray2", title="", decreasingOrder=TRUE)
{  
  df <- df |> dplyr::select(is.character|is.factor)
  
  df <- as.data.frame(unclass(df), stringsAsFactors = TRUE)
  
  plot_list2 <- list()
  
  
  for(i in 1:ncol(df)) {   
    
    ColName <- colnames(df)[i] 
    
    df2 <- df |>
      group_by(df[,i]) |>
      dplyr::summarize(Count=n()) |>
      dplyr::rename_at(1, ~ColName)
    
    if (decreasingOrder){
      #order hasn't been worked out
    }
    
    p <- eval(substitute(ggplot(df2, aes_string(x=ColName, y="Count")) +
                           coord_flip() +  
                           xlab(ColName)  +
                           ggtitle(title) +
                           theme(axis.title.x = element_text(size = font_size), axis.title.y = element_text(size = 9), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill)) +
                           geom_bar(color="black", fill="white", stat="identity") +
                           geom_text(aes(label = Count), size=3, hjust = -.1), list(i=i)))
    
    plot_list2[[i]] <- p 
    
    
  }
  return (plot_list2)
}


EHSummarize_SingleColumn_Boxplots <- function(df, font_size=7)
{  
  df <- select_if(df, is.numeric)


plot_list2 <- list()

for(i in 1:ncol(df)) {     
  
  qp <- toString(head(sort(round(df[,i],2)),5))
  qz <- toString(tail(sort(round(df[,i],2)),5))
  qk <- str_c("L:   ", qp, "\\\n", "H:   ", qz)
  
  qk <- gsub('\\\\','', qk)
  
  p <- eval(substitute(ggplot(df, aes(df[,i])) +
                         coord_flip() +  
                         xlab(colnames(df)[i])  +
                         ylab(qk) +
                         theme(axis.title.x = element_text(size = font_size), axis.title.y = element_text(size = 9), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = "slategray2", color="darkslategray")) +
                         geom_boxplot(), list(i=i)))
  
  plot_list2[[i]] <- p 
  
  
}
return (plot_list2)
}


#dfTrain <- read.csv("C:\\Users\\erico\\Documents\\R\\CUNY_621\\Baseball\\moneyball-training-data.csv", header=TRUE)
#dfTrain <- dfTrain %>%
#  mutate(xq = ifelse(TEAM_PITCHING_H >1500, 1, 0))
#EHExplore_Correlations_Boxplots(dfTrain, "xq")

EHSummarize_SingleColumn_Histograms <- function(df, font_size = 7, hist_nbins = 20)
{
  
  df <- select_if(df, is.numeric)
  
  plot_list2 <- list()
  
  for(i in 1:ncol(df)) {     
    
    qp <- toString(head(sort(round(df[,i],2)),5))
    qz <- toString(tail(sort(round(df[,i],2)),5))
    qk <- str_c("L:   ", qp, "\\\n", "H:   ", qz)
    
    qk <- gsub('\\\\','', qk)
    
    p <- eval(substitute(ggplot(df, aes(df[,i])) +
                           ylab(colnames(df)[i])  +
                           xlab(qk) +
                           theme(axis.title.x = element_text(size = font_size), axis.title.y = element_text(size = 9), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_text(size=8),  panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_blank(), panel.background = element_rect(fill = "slategray2", color="darkslategray"))  + 
                           geom_histogram(bins=hist_nbins, fill="white", aes(y = stat(density))) +
                           geom_density(col = "red"), list(i=i)))
    plot_list2[[i]] <- p 
    
  }
  return (plot_list2)
}


EHExplore_TwoContinuousColumns_Scatterplots <- function(df, y, flip=FALSE, rectfill="slategray2", pointfill="white")
{
  plot_list <- list()
  
  df=as.data.frame(df)
  
  df <- select_if(df, is.numeric)
  
  for(i in 1:ncol(df)) {
    
    ct <- cor.test(df[,i], df[,y])
    
    xText <- str_c("Correlation: ", round(ct$estimate,2), "   p value: ", round(ct$p.value,2))
    
    x1 = df[[i]]
    y1 =y
    
    if(flip)
    {
      x1=y
      y1=df[[i]]
    }
    
    p <- ggplot(df, aes_string(x1, y1)) +
      geom_point(fill="navy", color=pointfill) +
      geom_smooth(method = "loess", color="red", fill="lightcoral") +
      ylab(y) +
      xlab(xText) +
      theme(title = element_text(size=9), axis.title.x = element_text(size = 8), axis.title.y = element_text(size = 9), axis.text.x = element_text(size = 8), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkslategray")) +
      ggtitle(colnames(df)[i])
    
    p <- eval(substitute(p, list(i=i)))
    plot_list[[i]] <- p 
    
  }
  return(plot_list)
}

EHExplore_TwoContinuousColumns_CorrelationsAndPValues <- function(df, y)
  
  #Also works for one continuous and one binary
  # Error in cor.test.default(df[, i], df[, y]) :   'x' must be a numeric vector may occur if df is not recognized as a dataframe - even when it is derived from another dataframe
{
  df <- as.data.frame(df)
  df <- select_if(df, is.numeric)
  dfResult <- data.frame()
  
  
  for(i in 1:ncol(df)) {
    
    ct <- cor.test(df[,i], df[,y])
    #print (df[[i]])
    rw <- c(colnames(df)[i], round(ct$estimate,2), round(ct$p.value,2))
    dfResult <- rbind(dfResult, rw) 
  }
  
  colnames(dfResult) = c("column", "correlation", "p")   
  dfResult <- dfResult |>
    dplyr::arrange(p)
    
  return(dfResult)
  
}


EHExplore_OneContinuousAndOneCategoricalColumn_Boxplots <- function(df, y, yCategorical=TRUE, rectfill="slategray2")
{
  plot_list3 <- list()
  
  #At this point, y has to be categorical, the only one and the last one
  #Error: Error in parse(text = x, keep.source = FALSE) : <text>:1:12: unexpected symbol may result if the previous rule is violated
  
  zz <- ncol(df) - 1
  
  for(i in 1:zz) {
    
    x1 = df[[i]]
    y1 =y
    
    
    p <- ggplot(df, aes_string(x1, y1, fill=y1)) +
      #xlab(colnames(df)[i])  +
      #ylab(xText) +
      theme(title = element_text(size=9), axis.title.x = element_text(size = 9), axis.title.y = element_text(size = 9), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkslategray")) +
      scale_color_d3()+
      scale_fill_d3()+   
      theme(legend.position = "none") +
      ggtitle(colnames(df)[i]) +
      geom_boxplot()
    
    plot_list3[[i]] <- eval(substitute(p, list(i=i)))
    
  }
  
  plot_list3[1]
  
  return(plot_list3)
}


EHSummarize_StandardPlots <-function(data, y, return_list = FALSE, h_nbins = 20, print=TRUE, type="scatter")
{  
  #Error - ! Can't subset columns past the end. - may mean you passed a tibble, not a dataframe
  
  list1 <- EHSummarize_SingleColumn_Boxplots(data)
  list2 <- EHSummarize_SingleColumn_Histograms(data, hist_nbins =  h_nbins)
  
  if(type=="scatter"){
    list3 <- EHExplore_TwoContinuousColumns_Scatterplots(data, y)
  } else if (type=="box"){
    list3 <- EHExplore_OneContinuousAndOneCategoricalColumn_Boxplots(data, y)
  }
  
  zz2 <- list()

  
  for(i in 1:length(list1)) {
    zz2[i*3-2] <- list1[i]
    zz2[i*3-1] <- list2[i]
    zz2[i*3] <- list3[i]
  }
  
  if (print) {
    lenZ <- length(zz2)
    quotient <- lenZ %/% 9
    gap <- lenZ - quotient*9
    gaprows <- gap/3
    
    if (lenZ>=9) {
    for(i in 1:quotient) { 
      
      start <- (i-1)*9 + 1
      finish <- start + 8
      
      grid.arrange(grobs=zz2[c(start:finish)], ncol=3)
      
    }
    }
    
    if (gaprows>0) {
      
      start <- quotient*9 + 1
      finish <- start + gaprows*3 - 1
      
      grid.arrange(grobs=zz2[c(start:finish)], ncol=3, nrow=gaprows)
    }  
  }
  
  if (return_list) {
    return (zz2)
  }
  
}

EHExplore_Multicollinearity <-function(df, printCorrs=FALSE, printHeatMap = TRUE, printHighest=FALSE, threshold=.85,  title="Heatmap for Multicollinearity Analysis") {
  
  #To print out only what you want, set the function to a variable, i.e. x <- EHExplore_Multicollinearity
  #If you see: Error in if ((mult2[i, j] > threshold | mult2[i, j] < -1 * threshold) &  : missing value where TRUE/FALSE needed it means there are missing values
  
  dfCor <- as.data.frame(cor(df))
  
  library(corrplot)
  my_matrix <- df[]
  cor_res <- cor(my_matrix, use = "na.or.complete")

  
  if (printCorrs) {
    print(dfCor)
  }
  
  if (printHeatMap) {
  my_matrix <- df[]
  cor_res <- cor(my_matrix, use = "na.or.complete")
  
  z <- corrplot(cor_res, title = title, mar=c(0,0,2,0), 
                diag=FALSE, type = "upper", order = "original", tl.col = "black", tl.srt = 45, tl.cex = 0.55)
  }
  
  dfmm <- data.frame(col1=character(),
                     col2=character(),
                     correlation=double())
  
  mult2 <- as.data.frame(dfCor)
  
  for(i in 1:ncol(mult2)) {       # for-loop over columns
    for(j in 1:nrow(mult2)) {
      
      if((mult2[i,j] >threshold | mult2[i,j] < -1*threshold) & mult2[i,j] != 1){
        v <- c(colnames(mult2[i]), colnames(mult2[j]), mult2[i,j])
        dfmm <- rbind(dfmm, data.frame(col1 =colnames(mult2[i]), col2 = colnames(mult2[j]), correlation= 
        mult2[i,j], stringsAsFactors = FALSE))
        }
    }
  }

if (nrow(dfmm)>0){
  
    nrow1 <- nrow(dfmm)/2
    
  for (j in 1:nrow1){
    cl1 <- dfmm[j,1]
    cl2 <- dfmm[j,2]
    
    dfmm <- subset(dfmm, dfmm[,1]!=cl2 | dfmm[,2]!=cl1)
  }    
} else {
  dfmm[nrow(df) + 1,] = c("No Values", 0, 0)
}
  
    if (printHighest){
    print(dfmm)  
  }
  
 rlist <- list(dfCor, dfmm)
  return (rlist)
  
}


EHModel_Regression_StandardLM <- function(df, y, splitRatio=.8, xseed = 0, vif=TRUE, tests = TRUE, avplots = FALSE, xstepAIC=TRUE, returnLM=FALSE) {
  
  library(caTools)
  library(Metrics)
  
  
  if(xseed>0) {
    set.seed(xseed)
  }
  
  par(mfcol=c(2,2))
  fla <- substitute(n ~ ., list(n = as.name(y)))
  
  if(splitRatio==1) {
    mod_4 <- lm(fla, df)
  } else {
    
    i <- createDataPartition(unlist(df[,y]), p=splitRatio, list=FALSE)
    
    test_reg <- df[-i,]
    train_reg <- df[i,]
    mod_4 <- lm(fla, train_reg)
  }
  
  if(xstepAIC){
  step3 <- stepAIC(mod_4, trace=FALSE)
  } else {
    step3 <- mod_4
  }
  
  step3_summary <- summary(step3)
  print(step3_summary)
  
  if (vif){
  print("VIF Analysis")
  vif_values <- car::vif(step3)
  print(vif_values)
  }
  
  print(plot(step3))
  
  if (tests) {
  library(lmtest)
  print(bptest(step3))
  
  print(shapiro.test(step3$residuals))
  }
  
  if (avplots) {
    avPlots(step3)
  }
  
  print(paste("AIC: ", AIC(step3)))
  
  if (splitRatio==1){
    
    list_data <- c(step3, 0, 0, 0)
    
    if(!returnLM) {
      return(list_data)
    }else{
      return (step3)
    }

    
  } else {
    pred_linreg <- predict(step3,test_reg)
    resids <- test_reg[,y]-pred_linreg
    
    rmse1 <- rmse( test_reg[,y],pred_linreg)
    print(paste("RMSE on evaluation set: ", rmse1))
  }
  
  list_data <- c(step3, rmse1, step3_summary$sigma, resids)
  
  if(!returnLM) {
  return(list_data)
  }else{
  return (step3)
  }
}
  
EHModel_Regression_Robust <- function(df, y, splitRatio=.8, xseed = 0) {
  
  library(caTools)
  library(Metrics)
  
  if(xseed>0) {
    set.seed(xseed)
  }
  
  fla <- substitute(n ~ ., list(n = as.name(y)))
  fm <- as.formula(fla)
  
  i <- createDataPartition(unlist(df[y]), p=splitRatio, list=FALSE)
  
  test_reg <- df[-i,]
  train_reg <- df[i,]
    
    m1 <- rlm(fm, train_reg)
    m1_summary <- summary(m1)
    print(m1_summary)
  
    pred_linreg <- predict(m1,test_reg)
    resids <- test_reg[,y]-pred_linreg
    
    rmse1 <- rmse( test_reg[,y],pred_linreg)
    print(paste("RMSE: ", rmse1))
    
    list_data <- list(c(m1), rmse1, m1_summary$sigma, resids)
    
    return(list_data)

}


EHExplore_TwoCategoricalColumns_Barcharts <- function(df, y)
{
  
  plot_list4 <- list()
  
  df <- df %>% select_if(function(x) is.character(x)|is.factor(x))
  
  df[,y] <- as.factor(df[,y])
  
  for(i in 1:ncol(df)) {
    
    
    df[,i] <- as.factor(df[ ,i])
    
    p <- ggplot(df, aes_string(x=df[ , i], fill=y)) +
      geom_bar(position = "fill") +
      ylab("Proportion") +
      xlab(colnames(df)[i]) +
      stat_count(geom="text", aes(label=stat(count)), position=position_fill(vjust=.5), color="black") +
      scale_color_d3()+
      scale_fill_d3()+
      theme(title = element_text(size=9), axis.title.x = element_text(size = 8), axis.title.y = element_text(size = 9), axis.text.x = element_text(size = 8), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = "lightskyblue1", color="darkslategray")) +
      ggtitle(paste("Number and Proportion of ", y, " by ", names(df)[i])) + 
      coord_flip()
    
    p <- eval(substitute(p, list(i=i)))
    plot_list4[[i]] <- p
  }
  
  return (plot_list4)
}

EHModel_Regression_Logistic <-function(df, y, splitRatio = .8, xseed = 0, returnLM=FALSE)
{
  library(caTools)
  library(ROCR)
  
  if(xseed>0) {
    set.seed(xseed)
  }
  
  if(splitRatio==1) {
    fla <- substitute(n ~ ., list(n = as.name(y)))
    
    logistic_model <- glm(fla,
                          data = df,
                          family = "binomial")
    
    # Summary
    print(summary(logistic_model))
    
    listq = list()
    listq[1] <- logistic_model
    listq[2] <- 0
    listq[3] <- 0
    
    
    if(!returnLM) {
      return(listq)
    }else{
      return (logistic_model)
    }
  }
  
  i <- createDataPartition(unlist(df[y]), p=splitRatio, list=FALSE)
  
  test_reg <- df[-i,]
  train_reg <- df[i,]
  
  fla <- substitute(n ~ ., list(n = as.name(y)))
  
  logistic_model <- glm(fla,
                        data = train_reg,
                        family = "binomial")
  
  # Summary
  print(summary(logistic_model))
  
  # Predict test data based on model
  predict_reg <- predict(logistic_model,
                         test_reg, type = "response")

  scored_class <- ifelse(predict_reg >0.5, 1, 0)
  class <- test_reg[,y]
  
  dfPred <- data.frame(class, scored_class)
  
  dfPred$class <- as.factor(dfPred$class)
  dfPred$scored_class <- as.factor(dfPred$scored_class)
  
  q <-confusionMatrix(data = dfPred$scored_class, reference = dfPred$class)
  print(q)
  
  dfPred_raw <- data.frame(class, predict_reg)
  
  roc(class ~ predict_reg, dfPred_raw)
  
roc1 <- roc(dfPred_raw$class,
              dfPred_raw$predict_reg, plot=TRUE)
xauc <- roc1$auc

print(roc1)

listq = list()
listq[1] <- logistic_model
listq[2] <- q$overall['Accuracy']
listq[3] <- logistic_model$aic
listq[4] <- xauc

if(!returnLM) {
  return(listq)
}else{
  return (logistic_model)
}

}


EHPrepare_ScaleAllButTarget <-function(df, y)
{
  
  df1 <- df %>%
    dplyr::select(-{{y}})
  
  df1 <- data.frame(scale(df1))
  df2 <- df %>%
    dplyr::select({{y}})
  
  df3 <- cbind(df1,df2)
  
  return(df3)
}

EHModel_Regression_Logistic_Iterations <- function(df, y, numOfIterations=100)
{
  
  acc = list()
  AIC = list()
  AUC = list()
  
  for (i in 1:numOfIterations)
  {
    q <- EHModel_Regression_Logistic(df, y)
    acc[i]=q[2]
    AIC[i]=q[3]
    AUC[i] = q[4]
  }
  
  accv <- unlist(acc)
  aveq <- mean(accv)
  
  aicv <- unlist(AIC)
  aicq <- mean(aicv)
  
  aucv <- unlist(AUC)
  aucq <- mean(aucv)
  
  print(paste("Accuracy: ", aveq))
  print(paste("AIC: ", aicq))
  print(paste("AUC: ", aucq))
  
}

EHModel_Regression_Standard_Iterations <- function(df, y, numOfIterations=100)
{
  
  rmse2 = list()
  rse = list()
  
  for (i in 1:numOfIterations)
  {
    q <- EHModel_Regression_StandardLM(df, y, xstepAIC=FALSE)
    rmse2[i]=q[2]
    rse[i]=q[3]
  }
  
  rsme2q <- unlist(rmse2)
  rsme2m <- mean(rsme2q)
  
  rsev <- unlist(rse)
  rsem <- mean(rsev)
  
  print(paste("Average RSME: ", rsme2m))
  print(paste("Average RSE: ", rsem))
  
  
}

EHModel_Regression_Robust_Iterations <- function(df, y, numOfIterations=100)
{
  
  rmse2 = list()
  rse = list()
  
  for (i in 1:numOfIterations)
  {
    q <- EHModel_Regression_Robust(df, y)
    rmse2[i]=q[2]
    rse[i]=q[3]
  }
  
  rsme2q <- unlist(rmse2)
  rsme2m <- mean(rsme2q)
  
  rsev <- unlist(rse)
  rsem <- mean(rsev)
  
  print(paste("Average RSME: ", rsme2m))
  print(paste("Average RSE: ", rsem))
  
}

EHPrepare_CreateDummies <- function(df, target, include=list(), exclude=list(), removeColumn=TRUE)
{
    #Error in top_vals$vals : $ operator is invalid for atomic vectors - this 
    #may simply mean one of your categorical variables only has one value
    
    
    targ123 <- target

    df3 <-  df %>%
      dplyr::select(-matches(targ123))
    
      fact <- df3 %>%
      dplyr::select(is.factor|is.character)
    
    
    cols <- colnames(fact)
    
    
    if(length(include>0)){
      
      
      cols <- include
    }
    
    
    if(length(exclude>0)){
      
      
      cols <- cols[! cols %in% exclude]
    }
    
    df4 <- fastDummies::dummy_cols(df, select_columns=cols, remove_selected_columns = removeColumn, remove_most_frequent_dummy = removeColumn, ignore_na=FALSE)
    
    
    colnames(df4) <- make.names(colnames(df4))
    return(df4)
    
  }
  
  
EHPrepare_RestrictDataFrameColumnsToThoseInCommon <- function(df1, df2, exclude=list())
{
  
  library(janitor)
  cmp <- compare_df_cols(df1, df2)
  
  cmp_No1 <- cmp %>%
    dplyr::filter(is.na(df1))%>%
    dplyr::filter(!column_name %in% exclude)
  
  cmp_No1V <- cmp_No1$column_name
  
  df2R <- df2 %>%
    dplyr::select(!any_of(cmp_No1V))
  
  cmp_No2 <- cmp %>%
    dplyr::filter(is.na(df2)) %>%
    dplyr::filter(!column_name %in% exclude)
  
  cmp_No2V <- cmp_No2$column_name
  
  df1R <- df1 %>%
    dplyr::select(!any_of(cmp_No2V))
  
  
  rlist <- list(df1R, df2R)
  return(rlist)
  
}

EHPrepare_BoxCox <- function(df, col, print=TRUE, newcol=FALSE)
{
  print("DO NOT USE!")
  #For some reason you have to generate the formula in a line before the call. I can't generate it in the method because of environment reasons.
  #So that means putting, e.g. "xformula = terget ~ 1" as a line before the call.  Target is whatever our target is, the rest stays the same
  #This doesn't fix it: https://stackoverflow.com/questions/74527907/r-how-do-i-pass-a-formula-to-the-linear-model-constructor-and-the-resulting-lin
  
  library(MASS)
  #For some reason boxcox fails if you use df as a parameter - so that's why it's df2
  
  df2 <- as.data.frame(df)
  
  hist(df2[,col], main=paste(col, "- Before"))
  fla <- substitute(n ~ 1, list(n = as.name(col)))
  
  #The problem is , that line stays in there so if you forget to change it you keep running the algorithm on the old variable even though you have sepcified a new one.
  
  if(print) {
  hist(df2[,col], main=paste(col, "- Before"))
  }
  
  #a<- qq #breaks the method - so it isn't used.
  
  b <- boxcox(lm(xformula, df2))
  lambda <- b$x[which.max(b$y)]
  df2[, col] <- (df2[,col] ^ lambda - 1) / lambda

  hist(df2[,col], main=paste(col, "- After"))

  if(print) {
  hist(df2[,col], main=paste(col, "- After, lambda =", lambda))
  }

  return(df2)
  
}

EHModel_DecisionTree <- function(df, target, seed=042760, levels=31, categorical=TRUE, printFancyTree=TRUE, printConfusionMatrix = TRUE, printDT=TRUE)
{
  #"Need to be the same factors" - Make sure to designate categorical=false if the targ123 is continuous
  # There are two trees - the tree from caret (train(formula, ...)) is what the rmse is based on.  
  # The other tree is not - it is also the one influenced by the number of levels.This is the 'fancy tree.'
  # I believe the fancy tree is also the one with all the stats.

    targ123 = target
    df4 <- df
  
  if (categorical) {
    df4[, targ123] <- as.factor(df4[, targ123])
  } 
  
  fla <- substitute(n ~ ., list(n = as.name(targ123)))
  
  set.seed(seed)
  
  i <- createDataPartition(unlist(df4[,targ123]), p=0.8, list=FALSE)
  
  dfEval <- df4[-i,]
  dfTrain <- df4[i,]
  
count(dfTrain[targ123])
  
  tc <- trainControl(method="cv", number=10)
  metric <- "Accuracy"
  
  
  library(rpart)
  
  levels2 = levels-1
  output.tree <- rpart(fla, data = dfTrain, control = rpart.control(maxdepth = levels2))
  
  
  library(rpart.plot)
  
  library(RColorBrewer)
  
  library(rattle)
  if(printFancyTree){
  fancyRpartPlot(output.tree)
  }
  
  Formula  = reformulate(".",response=targ123)
  dt <- train(Formula, data=dfTrain, method="rpart")
  
  if (printDT) {
    library(rpart.plot)
    rpart.plot(dt$finalModel)
  }  
  
  predictions <- predict(dt, dfEval)
  dfPred <- as.data.frame(predictions)
  
  if (categorical) {
    x <- factor(dfEval[, targ123])
    y <- confusionMatrix(predictions, x) 
    if(printConfusionMatrix) {
    print(y)
    }
  } else {
    
    #load Metrics package
    library(Metrics)
    rmseval <- rmse(dfEval[,targ123], dfPred$predictions)
    print(paste('Decision tree - RMSE on evaluation set: ', rmseval))
  }

  
  x <- as.data.frame(cbind(dfEval[,targ123], dfPred))
  
  x1 <- x %>%
    dplyr::rename("observeds" = 1) %>%
    mutate(observeds = as.double(observeds)) %>%
    mutate(predictions = as.double(predictions)) %>%
    mutate(residuals = observeds - predictions)
  
  
  newList <- list("dt" = dt, "errors" = x1)
  return(newList)

}

EHModel_RandomForest <- function(df4, target, seed=042760, categorical=TRUE, printRF = TRUE, printVarimp=TRUE, printPlot=TRUE, printConfusionMatrix=TRUE)
{
  
  #"Need to be the same factors" - Make sure to designate categorical=false if the targ123 is continuous
  #'Error in confusionMatrix.default(predictions, x) : the data cannot have more levels than the reference - This occured when the target was continuous
  
  targ123 <- target

  if (categorical) {
    df4[, targ123] <- as.factor(df4[, targ123])
  } 
  
  set.seed(seed)
  
  i <- createDataPartition(unlist(df4[,targ123]), p=0.8, list=FALSE)
  
  dfEval <- df4[-i,]
  dfTrain <- df4[i,]
  
  count(dfTrain[targ123])
  
  tc <- trainControl(method="cv", number=10)
  metric <- "Accuracy"
  
  
  Formula  = reformulate(".",response=targ123)
  rf <- train(Formula, data=dfTrain, method="rf", trControl = tc)
  
  if (printRF){
    print(rf)
  }

  if (printPlot){
    print(plot(rf))
  }
  if (printVarimp){
    print(varImp(rf))
  }
  
  
  predictions <- predict(rf, dfEval)
  dfPred <- as.data.frame(predictions)
  
  if (categorical) {
    x <- factor(dfEval[, targ123])
    y <- confusionMatrix(predictions, x) 
    if (printConfusionMatrix){
    print(y)
    }
  } else {
    
    library(Metrics)
    rmseval <- rmse(dfEval[,targ123], dfPred$predictions)
    print(paste('Random Forest - RMSE on evaluation set: ', rmseval))
  }
  
  print(paste("Parameters:   mtry = ", rf$finalModel$mtry, ", ntree = ", rf$finalModel$ntree, ", nrnodes = ", rf$finalModel$forest$nrnodes))

  x <- as.data.frame(cbind(dfEval[,targ123], dfPred))
  
  x1 <- x %>%
  dplyr::rename("observeds" = 1) %>%
  dplyr::mutate(observeds = as.double(observeds)) %>%
  dplyr::mutate(predictions = as.double(predictions)) %>%
  dplyr::mutate(residuals = observeds - predictions)

  
  newList <- list("rf" = rf, "errors" = x1)
  return(newList)
  
}

EHModel_SVM_ToReplace <- function(df4, target, method = "linear", seed=042760, printSVM = TRUE, printPlot=FALSE, printConfusionMatrix =TRUE, cValue=0, sigmaValue=0)
{
  #PROBLEM- formula (y ~ ) and a df takes 100 times longer than an x df and a y df!! Need to change. 
  
  #Scaling is done as part of pre-processing in train, so need not be done by hand.
  #For linear, c is tuned by the grid: expand.grid(C = seq(0.01, 2, length = 20).  For radial and poly, sigma and c are optimized automatically, UNLESS YOU SPECIFY BOTH (WOULD BE BETTER IN A LIST)
  
  targ123 <- target
  
    df4[, targ123] <- as.factor(df4[, targ123])

  set.seed(seed)
  
  i <- createDataPartition(unlist(df4[,targ123]), p=0.8, list=FALSE)
  
  dfEval <- df4[-i,]
  dfTrain <- df4[i,]
  
  count(dfTrain[targ123])
  
  tc <- trainControl(method="repeatedcv", number=10, repeats=3)
  metric <- "Accuracy"
  
  library("stringi")     
  method1 <- stri_trans_totitle(method)
  method2 <- paste0("svm", method1)
  
  Formula  = reformulate(".",response=targ123)
  
  if (method1 == "Linear") {
        svm <- train(Formula, data=dfTrain, method=method2, trControl = tc, preProcess = c("center","scale"), tuneGrid = expand.grid(C = seq(0.01, 2, length = 20)))
  } else if (method1=="Radial"|method1=="Poly") {
      if (cValue!=0 && sigmaValue!=0) {
        svm <- train(Formula, data=dfTrain, method=method2, trControl = tc, preProcess = c("center","scale"), tuneGrid = expand.grid(C = cValue, sigma=sigmaValue))
      } else {
        svm <- train(Formula, data=dfTrain, method=method2, trControl=tc, preProcess = c("center","scale")) 
      } 
  } else {
    print("Unkown kernel. The choices are linear, radial or poly.")
    retun()
  }
    
  
  if (printSVM){
    print(svm)
  }
  
  if (printPlot){
    print(plot(svm))
  }
 
  
  predictions <- predict(svm, dfEval)
  dfPred <- as.data.frame(predictions)
  
    x <- factor(dfEval[, targ123])
    y <- confusionMatrix(predictions, x) 
    
    if (printConfusionMatrix){
      print(y)
    }
  
  #print(paste("Parameters:   mtry = ", rf$finalModel$mtry, ", ntree = ", rf$finalModel$ntree, ", nrnodes = ", rf$finalModel$forest$nrnodes))
  
    x <- as.data.frame(cbind(dfEval[,targ123], dfPred))
  
  x1 <- x %>%
    dplyr::rename("observeds" = 1) %>%
    mutate(observeds = as.double(observeds)) %>%
    mutate(predictions = as.double(predictions)) %>%
    mutate(residuals = observeds - predictions)
  
  
  newList <- list("svm" = svm, "errors" = x1)
  return(newList)
  
}

EHModel_SVM <- function(df4, target, method = "linear", seed=042760, printSVM = TRUE, printPlot=FALSE, printConfusionMatrix =TRUE, cValue=0, sigmaValue=0)
{
  
  Print ("Use the 'TOReplace' one - this doesn't work right - it can't make predictions")
  #PROBLEM- formula (y ~ ) and a df takes 100 times longer than an x df and a y df!! Need to change. 
  
  #Scaling is done as part of pre-processing in train, so need not be done by hand.
  #For linear, c is tuned by the grid: expand.grid(C = seq(0.01, 2, length = 20).  For radial and poly, sigma and c are optimized automatically, UNLESS YOU SPECIFY BOTH (WOULD BE BETTER IN A LIST)
  
  #"Error: `data` and `reference` should be factors with the same levels." may mean that the model cannot generate predicitons.  

  targ123 <- target
  
  df4[, targ123] <- as.factor(df4[, targ123])
  
  set.seed(seed)
  
  i <- createDataPartition(unlist(df4[,targ123]), p=0.8, list=FALSE)
  
  dfEval <- df4[-i,]
  dfTrain <- df4[i,]
  
  count(dfTrain[targ123])
  
  tc <- trainControl(method="repeatedcv", number=10, repeats=3)
  metric <- "Accuracy"
  
  library("stringi")     
  method1 <- stri_trans_totitle(method)
  method2 <- paste0("svm", method1)
  
  xdf <- dfTrain %>%
    dplyr::select(-targ123)
  ydf <- as.numeric(dfTrain[,targ123])
  
  if (method1 == "Linear") {
    svm <- train(xdf,ydf, method=method2, trControl = tc, preProcess = c("center","scale"), tuneGrid = expand.grid(C = seq(0.01, 2, length = 20)))
  } else if (method1=="Radial"|method1=="Poly") {
    if (cValue!=0 && sigmaValue!=0) {
      svm <- train(xdf,ydf, method=method2, trControl = tc, preProcess = c("center","scale"), tuneGrid = expand.grid(C = cValue, sigma=sigmaValue))
    } else {
      svm <- train(xdf,ydf, method=method2, trControl=tc, preProcess = c("center","scale")) 
    } 
  } else {
    print("Unkown kernel. The choices are linear, radial or poly.")
    retun()
  }
  
  
  if (printSVM){
    print(svm)
  }
  
  if (printPlot){
    print(plot(svm))
  }
  
  
  predictions <- predict(svm, dfEval)
  dfPred <- as.data.frame(predictions)
  
  x <- factor(dfEval[, targ123])
  
  y <- confusionMatrix(predictions, x) 
  
  if (printConfusionMatrix){
    print(y)
  }
  
  #print(paste("Parameters:   mtry = ", rf$finalModel$mtry, ", ntree = ", rf$finalModel$ntree, ", nrnodes = ", rf$finalModel$forest$nrnodes))
  
  x <- as.data.frame(cbind(dfEval[,targ123], dfPred))
  
  x1 <- x %>%
    dplyr::rename("observeds" = 1) %>%
    mutate(observeds = as.double(observeds)) %>%
    mutate(predictions = as.double(predictions)) %>%
    mutate(residuals = observeds - predictions)
  
  
  newList <- list("svm" = svm, "errors" = x1)
  return(newList)
  
}

EHCalculate_AUC_ForBinaryClasses <- function(dfPredictions, printPlot=TRUE, printConfusionMatrix=FALSE)
{
  
  #Observed come first, then Predictions!
  
  library(caTools)
  library(ROCR)
  
  dfPred <- dfPredictions %>%
    dplyr::rename("obs1"=1, "pred1"=2) %>%
    dplyr::select(obs1, pred1)
  

  dfPred1 <- dfPred
  dfPred1$obs1a <- as.factor(dfPred1$obs1)
  dfPred1$pred1a <- as.factor(dfPred1$pred1)
  q <-confusionMatrix(data = dfPred1$pred1a, reference = dfPred1$obs1a)
  
  if (printConfusionMatrix){
  print(q)
  }
  
  roc1 <- roc(dfPred$obs1,
              dfPred$pred1, plot=printPlot)
  xauc <- roc1$auc
  
  newList <- list("AUC" = xauc, "ConfusionMatrix" = q)
  return(newList)
  
}


EHModel_Predict <- function(model, dftestData, testData_IDColumn, predictionsColumnName ="Predictions", threshold=0, writeFile="")
{
  
  predictions <- predict(model,newdata=dftestData)
  predictions <- data.frame(as.vector(predictions)) 
  predictions[, testData_IDColumn] <- dftestData[, testData_IDColumn]
  predictions[,c(1,2)] <- predictions[,c(2,1)]
  colnames(predictions) <- c(testData_IDColumn, predictionsColumnName)
  
  if (threshold>0){
  predictions[, predictionsColumnName] <- ifelse(predictions[, predictionsColumnName]>threshold,1,0)
  }
  
  if (writeFile!="") {
    write_csv(predictions, writeFile) 
  }
  
  return(predictions)
}


EHPrepare_RemoveRecordsByRowNumber <- function(df, num)
{

  #num can be a single number or a c() of numbers
  df <- df[-c(num), ]
  return (df)
  
}

EHPrepare_RemoveColumnsWithAllNA <- function(df)
{
  
  #num can be a single number or a c() of numbers
  df <- df |>
    dplyr::select(where(~!all(is.na(.x))))
  return (df)
  
}

  EHSummarize_OverallColumnDescription <- function(df, summary=TRUE, str=FALSE, glimpse=TRUE, describe=FALSE)
  {
    library(Hmisc)
    
    if(summary)
    {
      print(summary(df))
    }
    
    if(str)
    {
      print(str(df))
    }
    
    if(glimpse)
    {
      print(glimpse(df))
    }
    
    if(describe)
    {
      print(describe(df))
    }
    
  }
ericonsi/EHData documentation built on April 12, 2025, 7:39 p.m.