EHData: Stats, Data Wrangling, Connection and Plotting Libraries by Eric Hirsch

library(devtools)
library(roxygen2)
library(Hmisc)
library(psych)
library(tidyverse)
library(skimr)
library(purrr)
library(tidyr)
library(tidyverse)
library(gridExtra)
library(lubridate)
library(fastDummies)
library(data.table)
library(mltools)
library(MASS)
library(car)
library(patchwork)
library(ggthemes)
library(tinytex)
library(stats)
library(ggsci)
library(scales)
library(naniar)
#library(Amelia)
library(caret)
library(pROC)
library(methods) 

EHTheme <- function(rectfill="cornsilk"){
  
  x <- theme(axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 9), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkblue"))
  
  return (x)
  
}

EHTheme_SlateGray2 <- function(rectfill="slategray2"){
  
  x <- theme(axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 9), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkslategray"))
  
  return (x)
  
}

EH_Theme_Histogram <- function(font_size=7, hist_nbins=30){
  
  #Example of Usage:
  # ggplot(dfErrors, aes(x=residuals)) +
  #  ggtitle("Distribution of Residuals for Decision Tree") +
  #  q$geom_histogram +
  #  q$theme_histogram +
  #  q$density_Histogram
  
  theme_histogram <- theme(axis.title.x = element_text(size = font_size), axis.title.y = element_text(size = 9), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_text(size=8),  panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_blank(), panel.background = element_rect(fill = "slategray2", color="darkslategray"))
    
  geom_histogram <- geom_histogram(bins=hist_nbins, fill="white", aes(y = stat(density))) 
    
  density_histogram <- geom_density(col = "red")

  newList <- list("theme_histogram" = theme_histogram, "geom_histogram" = geom_histogram, "density_Histogram" = density_histogram)
  return(newList)
  
}

EHModel_ChiSquare <- function(df, column1, column2, print="Nothing")
{
  library(kableExtra)
  t <- table(df[,column1], df[,column2])
  test <- chisq.test(t)
  
  if (print=="table")
  {
    kableExtra::kable(t)
  }
  
  if (print=="result")
  {
    test
  }
  
  if (print=="both")
  {
    kableExtra::kable(t)
    test
  }
  
  xlist=list(t, test)
  return (xlist)
  
}


#' @exportClass EH_SummarizeData
EH_SummarizeData <- setRefClass("EH_Summarize", fields = list(df = "data.frame", 
                                              font_size = "numeric", y = "character"), methods = list( 
                                                StandardPlots = function() 
                                                { 
                                                  EHSummarize_StandardPlots(df, y)
                                                }, 
                                                  Histogram = function() 
                                                { 
                                                  EHSummarize_SingleColumn_Histograms(df)
                                                } 
                                              )) 


EHSummarize_MissingValues <- function(df)
{

  library(naniar)
  
  #1. Missing Completely at Random (MCAR):
  #2. Missing at Random (MAR):
  #3. Missing Not at Random (MNAR)
  
  list12 = list()
  
  list12[[1]] <- gg_miss_var(df)
  list12[[2]] <- vis_miss(df)
  list12[[3]] <- gg_miss_upset(df)
  
  return(list12)

  
}


EHPrepare_MissingValues_Imputation <- function(df, y="", impute = "mean", print_all = FALSE)
{
  
  #1. Missing Completely at Random (MCAR):
  #2. Missing at Random (MAR):
  #3. Missing Not at Random (MNAR)
    
    dfImputedMean <- df

    for(i in colnames(df))
      if(is.numeric(df[,i])){
        meanv <- mean(df[,i], na.rm = TRUE)  
        dfImputedMean[,i][is.na(df[,i])] <- meanv
      }
    

  
    dfImputedMedian <- df
    
    for(i in colnames(df))
      if(is.numeric(df[,i])){
        medianv <- median(df[,i], na.rm = TRUE)  
        dfImputedMedian[,i][is.na(df[,i])] <- medianv
      }
    
    if(y==""){
      if(impute=="mean"){
        return(dfImputedMean)
      } else if (impute=="median"){
        return(dfImputedMedian)
      }
    }
    
  dfOmit <- na.omit(df)
  
  fla <- substitute(n ~ ., list(n = as.name(y)))
  m1 <- lm(fla, dfImputedMean)
  step1 <- stepAIC(m1, trace=FALSE)
  s1 <- summary(step1)$adj.r.squared
  
  fla2 <- substitute(n ~ ., list(n = as.name(y)))
  m2 <- lm(fla2, dfImputedMedian)
  step2 <- stepAIC(m2, trace=FALSE)
  s2 <- summary(step2)$adj.r.squared
  
  fla3 <- substitute(n ~ ., list(n = as.name(y)))
  m3 <- lm(fla3, dfOmit)
  step3 <- stepAIC(m3, trace=FALSE)
  s3 <- summary(step3)$adj.r.squared
  
  l1 <- vector(mode = "list", length = 5)
  names(l1) <- c("df", "type", "r2mean", "r2median", "r2omit")
  
  l1$r2mean = s1
  l1$r2median = s2
  l1$r2omit = s3

  
  if (impute == "mean") {
    l1$type = "mean"
    l1$df=dfImputedMean
  }
  else if (impute == "median") {
    l1$type = "median"
    l1$df=dfImputedMedian
  }
  else if (impute == "omit") {
    l1$type = "omit"
    l1$df=dfOmit
  }
  
  
  print(c("type:", l1$type))
  print(c("r2mean:", round(l1$r2mean,4)))
  print(c("r2median:", round(l1$r2median,4)))
  print(c("r2omit", round(l1$r2omit,4)))
  
    if (print_all) {
      print(summary(step1))
      print(summary(step2))
      print(summary(step3))
    }
  
    return (l1$df)
}

EHExplore_Interactions_Scatterplots <- function(df, y, interaction, rectfill="lightskyblue") {
  
  #If you get these Errors:
  #Error: Unknown input: tbl_df' = you probably did not pass it a proper dataframe (probably a tibble instead)
  
  library(ggsci)
  
  df <- as.data.frame(df)
  
  df <- select_if(df, is.numeric)
  
  v <- as.vector(df[,interaction])

  xtext1 = as.data.frame(aggregate(data.frame(count = v), list(value = v), length))
  df[interaction][df[interaction] == "0"] <- paste0("0 (n=", xtext1$count[1], ")")
  df[interaction][df[interaction] == "1"] <- paste0("1 (n=", xtext1$count[2], ")")
  
  
  df[,interaction] <- as.factor(df[,interaction])
  
  plot_list <- list()
  
  for(i in 1:ncol(df)) {     
    
    p <- eval(substitute(ggplot(df, aes_string(df[ , i], y, color=interaction)) +
                           geom_point(alpha=.1) +
                           geom_smooth(method = "lm") +
                           xlab(colnames(df)[i]) +
                           theme(title = element_text(size=9), axis.title.x = element_text(size = 9), axis.title.y = element_text(size = 9), axis.text.x = element_text(size = 8), panel.grid.major.x = element_line(color="gray"), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkslategray")) +
                           scale_color_d3()+
                           scale_fill_d3()+
                           ggtitle(colnames(df)[i]), list(i=i)))
    plot_list[[i]] <- p 
    
  }
  return(plot_list)
}

EHSummarize_SingleColumn_BarCharts1 <- function(df, font_size=7, rectfill="slategray2")
{  
  
  dfBar2<-data.frame(lapply(df,factor))
  
  plot_list2 <- list()
  
  for(i in 1:ncol(df)) {   
    
    dfBar3 <- dfBar2 %>% 
      dplyr::group_by(dfBar2[,i]) %>% 
      dplyr::summarise(Count = n())
    
    dfBar3 <- as.data.frame(dfBar3) |>
      dplyr::rename(Selection = 1)
    
    p <- eval(substitute(ggplot(dfBar3, aes(x=Selection, y=Count, fill=Selection)) +
                           geom_col() +
                           scale_color_brewer(type = "div", palette = 8)+
                           scale_fill_brewer(type = "div", palette = 8)+  
                           theme(legend.position="none") +
                           ggtitle(colnames(df)[i]) +
                           theme(title = element_text(size =(font_size)), axis.title.x = element_blank(), axis.title.y = element_text(size = font_size), axis.text.x = element_text(size = font_size, angle=30, vjust=.5), axis.text.y = element_text(size = font_size), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color=rectfill), panel.background = element_rect(fill = rectfill, color="black", size = .3)) +
                           geom_text(aes(label = Count), size=(3), fontface="bold", color="black",
                                     vjust = 1), list(i=i)))
    
    plot_list2[[i]] <- p 
    
    
  }
  return (plot_list2)
}


EHSummarize_SingleColumn_BarCharts2 <- function(df, font_size=7, decreasingOrder=TRUE, rectfill="slategray2")
{  
  
  dfBar2<-data.frame(lapply(df,factor))
  
  plot_list2 <- list()
  
  for(i in 1:ncol(df)) {   
    
    dfBar3 <- dfBar2 %>% 
      dplyr::group_by(dfBar2[,i]) %>% 
      dplyr::summarise(Count = n())
    
    dfBar3 <- as.data.frame(dfBar3) |>
      dplyr::rename(Selection = 1)
    
    if (decreasingOrder){
      dfBar3$Selection <- factor(dfBar3$Selection,                                  
                                 levels = dfBar3$Selection[order(dfBar3$Count)])
    }
    
    p <- eval(substitute(ggplot(dfBar3, aes(x=Selection, y=Count)) +
                           coord_flip() +
                           geom_col(color="black", size=.1, fill="ivory", width=.7) +
                           theme(legend.position="none") +
                           ggtitle(colnames(df)[i]) +
                           theme(title = element_text(size =(font_size), face="bold"), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(size = font_size), axis.text.y = element_text(size = font_size), axis.ticks.x = element_blank(), axis.ticks.y = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(),  panel.grid.major.y=element_line(color=rectfill), panel.background = element_rect(fill = rectfill, color="black", size = .3 )) +
                           geom_text(aes(label = Count), size=(3), fontface="bold", color="red", hjust = 1.5), list(i=i)))
    
    plot_list2[[i]] <- p 
    
    
  }
  return (plot_list2)
}

EHSummarize_SingleColumn_BarCharts3 <- function(df, font_size=7, decreasingOrder=TRUE, rectfill="slategray2", title="")
{  
  
  dfBar2<-data.frame(lapply(df,factor))
  
  plot_list2 <- list()
  
  for(i in 1:ncol(df)) {   
    
    dfBar3 <- dfBar2 %>% 
      dplyr::group_by(dfBar2[,i]) %>% 
      dplyr::summarise(Count = n())
    
    dfBar3 <- as.data.frame(dfBar3) |>
      dplyr::rename(Selection = 1)
    
    if (decreasingOrder){
      dfBar3$Selection <- factor(dfBar3$Selection,                                  
                                 levels = dfBar3$Selection[order(dfBar3$Count)])
    }
    
    p <- eval(substitute(ggplot(dfBar3, aes(x=Selection, y=Count)) +
                           coord_flip() +
                           geom_col(color="black", size=.1, fill="ivory", width=.7) +
                           theme(legend.position="none") +
                           ylab(colnames(df)[i]) + 
                           ggtitle(title) +
                           theme(title = element_text(size =(font_size), face="bold"), axis.title.x = element_text(size = font_size), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_text(size = font_size), axis.ticks.x = element_blank(), axis.ticks.y = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(),  panel.grid.major.y=element_line(color=rectfill), panel.background = element_rect(fill = rectfill, color="black", size = .3 )) +
                           geom_text(aes(label = Count), size=(3), fontface="bold", color="red", hjust = 1.5), list(i=i)))
    
    plot_list2[[i]] <- p 
    
    
  }
  return (plot_list2)
}

EHSummarize_SingleColumn_Countplots <- function(df, font_size=7, rectfill="slategray2", title="", decreasingOrder=TRUE)
{  
  df <- df |> dplyr::select(is.character|is.factor)
  
  df <- as.data.frame(unclass(df), stringsAsFactors = TRUE)
  
  plot_list2 <- list()
  
  
  for(i in 1:ncol(df)) {   
    
    ColName <- colnames(df)[i] 
    
    df2 <- df |>
      group_by(df[,i]) |>
      dplyr::summarize(Count=n()) |>
      dplyr::rename_at(1, ~ColName)
    
    if (decreasingOrder){
      #order hasn't been worked out
    }
    
    p <- eval(substitute(ggplot(df2, aes_string(x=ColName, y="Count")) +
                           coord_flip() +  
                           xlab(ColName)  +
                           ggtitle(title) +
                           theme(axis.title.x = element_text(size = font_size), axis.title.y = element_text(size = 9), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill)) +
                           geom_bar(color="black", fill="white", stat="identity") +
                           geom_text(aes(label = Count), size=3, hjust = -.1), list(i=i)))
    
    plot_list2[[i]] <- p 
    
    
  }
  return (plot_list2)
}


EHSummarize_SingleColumn_Boxplots <- function(df, font_size=7)
{  
  df <- select_if(df, is.numeric)


plot_list2 <- list()

for(i in 1:ncol(df)) {     
  
  qp <- toString(head(sort(round(df[,i],2)),5))
  qz <- toString(tail(sort(round(df[,i],2)),5))
  qk <- str_c("L:   ", qp, "\\\n", "H:   ", qz)
  
  qk <- gsub('\\\\','', qk)
  
  p <- eval(substitute(ggplot(df, aes(df[,i])) +
                         coord_flip() +  
                         xlab(colnames(df)[i])  +
                         ylab(qk) +
                         theme(axis.title.x = element_text(size = font_size), axis.title.y = element_text(size = 9), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = "slategray2", color="darkslategray")) +
                         geom_boxplot(), list(i=i)))
  
  plot_list2[[i]] <- p 
  
  
}
return (plot_list2)
}


#dfTrain <- read.csv("C:\\Users\\erico\\Documents\\R\\CUNY_621\\Baseball\\moneyball-training-data.csv", header=TRUE)
#dfTrain <- dfTrain %>%
#  mutate(xq = ifelse(TEAM_PITCHING_H >1500, 1, 0))
#EHExplore_Correlations_Boxplots(dfTrain, "xq")

EHSummarize_SingleColumn_Histograms <- function(df, font_size = 7, hist_nbins = 20)
{
  
  df <- select_if(df, is.numeric)
  
  plot_list2 <- list()
  
  for(i in 1:ncol(df)) {     
    
    qp <- toString(head(sort(round(df[,i],2)),5))
    qz <- toString(tail(sort(round(df[,i],2)),5))
    qk <- str_c("L:   ", qp, "\\\n", "H:   ", qz)
    
    qk <- gsub('\\\\','', qk)
    
    p <- eval(substitute(ggplot(df, aes(df[,i])) +
                           ylab(colnames(df)[i])  +
                           xlab(qk) +
                           theme(axis.title.x = element_text(size = font_size), axis.title.y = element_text(size = 9), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_text(size=8),  panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_blank(), panel.background = element_rect(fill = "slategray2", color="darkslategray"))  + 
                           geom_histogram(bins=hist_nbins, fill="white", aes(y = stat(density))) +
                           geom_density(col = "red"), list(i=i)))
    plot_list2[[i]] <- p 
    
  }
  return (plot_list2)
}


EHExplore_TwoContinuousColumns_Scatterplots <- function(df, y, flip=FALSE, rectfill="slategray2", pointfill="white")
{
  plot_list <- list()
  
  df=as.data.frame(df)
  
  df <- select_if(df, is.numeric)
  
  for(i in 1:ncol(df)) {
    
    ct <- cor.test(df[,i], df[,y])
    
    xText <- str_c("Correlation: ", round(ct$estimate,2), "   p value: ", round(ct$p.value,2))
    
    x1 = df[[i]]
    y1 =y
    
    if(flip)
    {
      x1=y
      y1=df[[i]]
    }
    
    p <- ggplot(df, aes_string(x1, y1)) +
      geom_point(fill="navy", color=pointfill) +
      geom_smooth(method = "loess", color="red", fill="lightcoral") +
      ylab(y) +
      xlab(xText) +
      theme(title = element_text(size=9), axis.title.x = element_text(size = 8), axis.title.y = element_text(size = 9), axis.text.x = element_text(size = 8), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkslategray")) +
      ggtitle(colnames(df)[i])
    
    p <- eval(substitute(p, list(i=i)))
    plot_list[[i]] <- p 
    
  }
  return(plot_list)
}

EHExplore_TwoContinuousColumns_CorrelationsAndPValues <- function(df, y)
  
  #Also works for one continuous and one binary
  # Error in cor.test.default(df[, i], df[, y]) :   'x' must be a numeric vector may occur if df is not recognized as a dataframe - even when it is derived from another dataframe
{
  df <- as.data.frame(df)
  df <- select_if(df, is.numeric)
  dfResult <- data.frame()
  
  
  for(i in 1:ncol(df)) {
    
    ct <- cor.test(df[,i], df[,y])
    #print (df[[i]])
    rw <- c(colnames(df)[i], round(ct$estimate,2), round(ct$p.value,2))
    dfResult <- rbind(dfResult, rw) 
  }
  
  colnames(dfResult) = c("column", "correlation", "p")   
  dfResult <- dfResult |>
    dplyr::arrange(p)
    
  return(dfResult)
  
}


EHExplore_OneContinuousAndOneCategoricalColumn_Boxplots <- function(df, y, yCategorical=TRUE, rectfill="slategray2")
{
  plot_list3 <- list()
  
  #At this point, y has to be categorical, the only one and the last one
  #Error: Error in parse(text = x, keep.source = FALSE) : <text>:1:12: unexpected symbol may result if the previous rule is violated
  
  zz <- ncol(df) - 1
  
  for(i in 1:zz) {
    
    x1 = df[[i]]
    y1 =y
    
    
    p <- ggplot(df, aes_string(x1, y1, fill=y1)) +
      #xlab(colnames(df)[i])  +
      #ylab(xText) +
      theme(title = element_text(size=9), axis.title.x = element_text(size = 9), axis.title.y = element_text(size = 9), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = rectfill, color="darkslategray")) +
      scale_color_d3()+
      scale_fill_d3()+   
      theme(legend.position = "none") +
      ggtitle(colnames(df)[i]) +
      geom_boxplot()
    
    plot_list3[[i]] <- eval(substitute(p, list(i=i)))
    
  }
  
  plot_list3[1]
  
  return(plot_list3)
}


EHSummarize_StandardPlots <-function(data, y, return_list = FALSE, h_nbins = 20, print=TRUE, type="scatter")
{  
  #Error - ! Can't subset columns past the end. - may mean you passed a tibble, not a dataframe
  
  list1 <- EHSummarize_SingleColumn_Boxplots(data)
  list2 <- EHSummarize_SingleColumn_Histograms(data, hist_nbins =  h_nbins)
  
  if(type=="scatter"){
    list3 <- EHExplore_TwoContinuousColumns_Scatterplots(data, y)
  } else if (type=="box"){
    list3 <- EHExplore_OneContinuousAndOneCategoricalColumn_Boxplots(data, y)
  }
  
  zz2 <- list()

  
  for(i in 1:length(list1)) {
    zz2[i*3-2] <- list1[i]
    zz2[i*3-1] <- list2[i]
    zz2[i*3] <- list3[i]
  }
  
  if (print) {
    lenZ <- length(zz2)
    quotient <- lenZ %/% 9
    gap <- lenZ - quotient*9
    gaprows <- gap/3
    
    if (lenZ>=9) {
    for(i in 1:quotient) { 
      
      start <- (i-1)*9 + 1
      finish <- start + 8
      
      grid.arrange(grobs=zz2[c(start:finish)], ncol=3)
      
    }
    }
    
    if (gaprows>0) {
      
      start <- quotient*9 + 1
      finish <- start + gaprows*3 - 1
      
      grid.arrange(grobs=zz2[c(start:finish)], ncol=3, nrow=gaprows)
    }  
  }
  
  if (return_list) {
    return (zz2)
  }
  
}

EHExplore_Multicollinearity <-function(df, printCorrs=FALSE, printHeatMap = TRUE, printHighest=FALSE, threshold=.85,  title="Heatmap for Multicollinearity Analysis") {
  
  #To print out only what you want, set the function to a variable, i.e. x <- EHExplore_Multicollinearity
  #If you see: Error in if ((mult2[i, j] > threshold | mult2[i, j] < -1 * threshold) &  : missing value where TRUE/FALSE needed it means there are missing values
  
  dfCor <- as.data.frame(cor(df))
  
  library(corrplot)
  my_matrix <- df[]
  cor_res <- cor(my_matrix, use = "na.or.complete")

  
  if (printCorrs) {
    print(dfCor)
  }
  
  if (printHeatMap) {
  my_matrix <- df[]
  cor_res <- cor(my_matrix, use = "na.or.complete")
  
  z <- corrplot(cor_res, title = title, mar=c(0,0,2,0), 
                diag=FALSE, type = "upper", order = "original", tl.col = "black", tl.srt = 45, tl.cex = 0.55)
  }
  
  dfmm <- data.frame(col1=character(),
                     col2=character(),
                     correlation=double())
  
  mult2 <- as.data.frame(dfCor)
  
  for(i in 1:ncol(mult2)) {       # for-loop over columns
    for(j in 1:nrow(mult2)) {
      
      if((mult2[i,j] >threshold | mult2[i,j] < -1*threshold) & mult2[i,j] != 1){
        v <- c(colnames(mult2[i]), colnames(mult2[j]), mult2[i,j])
        dfmm <- rbind(dfmm, data.frame(col1 =colnames(mult2[i]), col2 = colnames(mult2[j]), correlation= 
        mult2[i,j], stringsAsFactors = FALSE))
        }
    }
  }

if (nrow(dfmm)>0){
  
    nrow1 <- nrow(dfmm)/2
    
  for (j in 1:nrow1){
    cl1 <- dfmm[j,1]
    cl2 <- dfmm[j,2]
    
    dfmm <- subset(dfmm, dfmm[,1]!=cl2 | dfmm[,2]!=cl1)
  }    
} else {
  dfmm[nrow(df) + 1,] = c("No Values", 0, 0)
}
  
    if (printHighest){
    print(dfmm)  
  }
  
 rlist <- list(dfCor, dfmm)
  return (rlist)
  
}


EHModel_Regression_StandardLM <- function(df, y, splitRatio=.8, xseed = 0, vif=TRUE, tests = TRUE, avplots = FALSE, xstepAIC=TRUE, returnLM=FALSE) {
  
  library(caTools)
  library(Metrics)
  
  
  if(xseed>0) {
    set.seed(xseed)
  }
  
  par(mfcol=c(2,2))
  fla <- substitute(n ~ ., list(n = as.name(y)))
  
  if(splitRatio==1) {
    mod_4 <- lm(fla, df)
  } else {
    
    i <- createDataPartition(unlist(df[,y]), p=splitRatio, list=FALSE)
    
    test_reg <- df[-i,]
    train_reg <- df[i,]
    mod_4 <- lm(fla, train_reg)
  }
  
  if(xstepAIC){
  step3 <- stepAIC(mod_4, trace=FALSE)
  } else {
    step3 <- mod_4
  }
  
  step3_summary <- summary(step3)
  print(step3_summary)
  
  if (vif){
  print("VIF Analysis")
  vif_values <- car::vif(step3)
  print(vif_values)
  }
  
  print(plot(step3))
  
  if (tests) {
  library(lmtest)
  print(bptest(step3))
  
  print(shapiro.test(step3$residuals))
  }
  
  if (avplots) {
    avPlots(step3)
  }
  
  print(paste("AIC: ", AIC(step3)))
  
  if (splitRatio==1){
    
    list_data <- c(step3, 0, 0, 0)
    
    if(!returnLM) {
      return(list_data)
    }else{
      return (step3)
    }

    
  } else {
    pred_linreg <- predict(step3,test_reg)
    resids <- test_reg[,y]-pred_linreg
    
    rmse1 <- rmse( test_reg[,y],pred_linreg)
    print(paste("RMSE on evaluation set: ", rmse1))
  }
  
  list_data <- c(step3, rmse1, step3_summary$sigma, resids)
  
  if(!returnLM) {
  return(list_data)
  }else{
  return (step3)
  }
}
  
EHModel_Regression_Robust <- function(df, y, splitRatio=.8, xseed = 0) {
  
  library(caTools)
  library(Metrics)
  
  if(xseed>0) {
    set.seed(xseed)
  }
  
  fla <- substitute(n ~ ., list(n = as.name(y)))
  fm <- as.formula(fla)
  
  i <- createDataPartition(unlist(df[y]), p=splitRatio, list=FALSE)
  
  test_reg <- df[-i,]
  train_reg <- df[i,]
    
    m1 <- rlm(fm, train_reg)
    m1_summary <- summary(m1)
    print(m1_summary)
  
    pred_linreg <- predict(m1,test_reg)
    resids <- test_reg[,y]-pred_linreg
    
    rmse1 <- rmse( test_reg[,y],pred_linreg)
    print(paste("RMSE: ", rmse1))
    
    list_data <- list(c(m1), rmse1, m1_summary$sigma, resids)
    
    return(list_data)

}


EHExplore_TwoCategoricalColumns_Barcharts <- function(df, y)
{
  
  plot_list4 <- list()
  
  df <- df %>% select_if(function(x) is.character(x)|is.factor(x))
  
  df[,y] <- as.factor(df[,y])
  
  for(i in 1:ncol(df)) {
    
    
    df[,i] <- as.factor(df[ ,i])
    
    p <- ggplot(df, aes_string(x=df[ , i], fill=y)) +
      geom_bar(position = "fill") +
      ylab("Proportion") +
      xlab(colnames(df)[i]) +
      stat_count(geom="text", aes(label=stat(count)), position=position_fill(vjust=.5), color="black") +
      scale_color_d3()+
      scale_fill_d3()+
      theme(title = element_text(size=9), axis.title.x = element_text(size = 8), axis.title.y = element_text(size = 9), axis.text.x = element_text(size = 8), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = "lightskyblue1", color="darkslategray")) +
      ggtitle(paste("Number and Proportion of ", y, " by ", names(df)[i])) + 
      coord_flip()
    
    p <- eval(substitute(p, list(i=i)))
    plot_list4[[i]] <- p
  }
  
  return (plot_list4)
}

EHModel_Regression_Logistic <-function(df, y, splitRatio = .8, xseed = 0, returnLM=FALSE)
{
  library(caTools)
  library(ROCR)
  
  if(xseed>0) {
    set.seed(xseed)
  }
  
  if(splitRatio==1) {
    fla <- substitute(n ~ ., list(n = as.name(y)))
    
    logistic_model <- glm(fla,
                          data = df,
                          family = "binomial")
    
    # Summary
    print(summary(logistic_model))
    
    listq = list()
    listq[1] <- logistic_model
    listq[2] <- 0
    listq[3] <- 0
    
    
    if(!returnLM) {
      return(listq)
    }else{
      return (logistic_model)
    }
  }
  
  i <- createDataPartition(unlist(df[y]), p=splitRatio, list=FALSE)
  
  test_reg <- df[-i,]
  train_reg <- df[i,]
  
  fla <- substitute(n ~ ., list(n = as.name(y)))
  
  logistic_model <- glm(fla,
                        data = train_reg,
                        family = "binomial")
  
  # Summary
  print(summary(logistic_model))
  
  # Predict test data based on model
  predict_reg <- predict(logistic_model,
                         test_reg, type = "response")

  scored_class <- ifelse(predict_reg >0.5, 1, 0)
  class <- test_reg[,y]
  
  dfPred <- data.frame(class, scored_class)
  
  dfPred$class <- as.factor(dfPred$class)
  dfPred$scored_class <- as.factor(dfPred$scored_class)
  
  q <-confusionMatrix(data = dfPred$scored_class, reference = dfPred$class)
  print(q)
  
  dfPred_raw <- data.frame(class, predict_reg)
  
  roc(class ~ predict_reg, dfPred_raw)
  
roc1 <- roc(dfPred_raw$class,
              dfPred_raw$predict_reg, plot=TRUE)
xauc <- roc1$auc

print(roc1)

listq = list()
listq[1] <- logistic_model
listq[2] <- q$overall['Accuracy']
listq[3] <- logistic_model$aic
listq[4] <- xauc

if(!returnLM) {
  return(listq)
}else{
  return (logistic_model)
}

}


EHPrepare_ScaleAllButTarget <-function(df, y)
{
  
  df1 <- df %>%
    dplyr::select(-{{y}})
  
  df1 <- data.frame(scale(df1))
  df2 <- df %>%
    dplyr::select({{y}})
  
  df3 <- cbind(df1,df2)
  
  return(df3)
}

EHModel_Regression_Logistic_Iterations <- function(df, y, numOfIterations=100)
{
  
  acc = list()
  AIC = list()
  AUC = list()
  
  for (i in 1:numOfIterations)
  {
    q <- EHModel_Regression_Logistic(df, y)
    acc[i]=q[2]
    AIC[i]=q[3]
    AUC[i] = q[4]
  }
  
  accv <- unlist(acc)
  aveq <- mean(accv)
  
  aicv <- unlist(AIC)
  aicq <- mean(aicv)
  
  aucv <- unlist(AUC)
  aucq <- mean(aucv)
  
  print(paste("Accuracy: ", aveq))
  print(paste("AIC: ", aicq))
  print(paste("AUC: ", aucq))
  
}

EHModel_Regression_Standard_Iterations <- function(df, y, numOfIterations=100)
{
  
  rmse2 = list()
  rse = list()
  
  for (i in 1:numOfIterations)
  {
    q <- EHModel_Regression_StandardLM(df, y, xstepAIC=FALSE)
    rmse2[i]=q[2]
    rse[i]=q[3]
  }
  
  rsme2q <- unlist(rmse2)
  rsme2m <- mean(rsme2q)
  
  rsev <- unlist(rse)
  rsem <- mean(rsev)
  
  print(paste("Average RSME: ", rsme2m))
  print(paste("Average RSE: ", rsem))
  
  
}

EHModel_Regression_Robust_Iterations <- function(df, y, numOfIterations=100)
{
  
  rmse2 = list()
  rse = list()
  
  for (i in 1:numOfIterations)
  {
    q <- EHModel_Regression_Robust(df, y)
    rmse2[i]=q[2]
    rse[i]=q[3]
  }
  
  rsme2q <- unlist(rmse2)
  rsme2m <- mean(rsme2q)
  
  rsev <- unlist(rse)
  rsem <- mean(rsev)
  
  print(paste("Average RSME: ", rsme2m))
  print(paste("Average RSE: ", rsem))
  
}

EHPrepare_CreateDummies <- function(df, target, include=list(), exclude=list(), removeColumn=TRUE)
{
    #Error in top_vals$vals : $ operator is invalid for atomic vectors - this 
    #may simply mean one of your categorical variables only has one value
    
    
    targ123 <- target

    df3 <-  df %>%
      dplyr::select(-matches(targ123))
    
      fact <- df3 %>%
      dplyr::select(is.factor|is.character)
    
    
    cols <- colnames(fact)
    
    
    if(length(include>0)){
      
      
      cols <- include
    }
    
    
    if(length(exclude>0)){
      
      
      cols <- cols[! cols %in% exclude]
    }
    
    df4 <- fastDummies::dummy_cols(df, select_columns=cols, remove_selected_columns = removeColumn, remove_most_frequent_dummy = removeColumn, ignore_na=FALSE)
    
    
    colnames(df4) <- make.names(colnames(df4))
    return(df4)
    
  }
  
  
EHPrepare_RestrictDataFrameColumnsToThoseInCommon <- function(df1, df2, exclude=list())
{
  
  library(janitor)
  cmp <- compare_df_cols(df1, df2)
  
  cmp_No1 <- cmp %>%
    dplyr::filter(is.na(df1))%>%
    dplyr::filter(!column_name %in% exclude)
  
  cmp_No1V <- cmp_No1$column_name
  
  df2R <- df2 %>%
    dplyr::select(!any_of(cmp_No1V))
  
  cmp_No2 <- cmp %>%
    dplyr::filter(is.na(df2)) %>%
    dplyr::filter(!column_name %in% exclude)
  
  cmp_No2V <- cmp_No2$column_name
  
  df1R <- df1 %>%
    dplyr::select(!any_of(cmp_No2V))
  
  
  rlist <- list(df1R, df2R)
  return(rlist)
  
}

EHPrepare_BoxCox <- function(df, col, print=TRUE, newcol=FALSE)
{
  print("DO NOT USE!")
  #For some reason you have to generate the formula in a line before the call. I can't generate it in the method because of environment reasons.
  #So that means putting, e.g. "xformula = terget ~ 1" as a line before the call.  Target is whatever our target is, the rest stays the same
  #This doesn't fix it: https://stackoverflow.com/questions/74527907/r-how-do-i-pass-a-formula-to-the-linear-model-constructor-and-the-resulting-lin
  
  library(MASS)
  #For some reason boxcox fails if you use df as a parameter - so that's why it's df2
  
  df2 <- as.data.frame(df)
  
  hist(df2[,col], main=paste(col, "- Before"))
  fla <- substitute(n ~ 1, list(n = as.name(col)))
  
  #The problem is , that line stays in there so if you forget to change it you keep running the algorithm on the old variable even though you have sepcified a new one.
  
  if(print) {
  hist(df2[,col], main=paste(col, "- Before"))
  }
  
  #a<- qq #breaks the method - so it isn't used.
  
  b <- boxcox(lm(xformula, df2))
  lambda <- b$x[which.max(b$y)]
  df2[, col] <- (df2[,col] ^ lambda - 1) / lambda

  hist(df2[,col], main=paste(col, "- After"))

  if(print) {
  hist(df2[,col], main=paste(col, "- After, lambda =", lambda))
  }

  return(df2)
  
}

EHModel_DecisionTree <- function(df, target, seed=042760, levels=31, categorical=TRUE, printFancyTree=TRUE, printConfusionMatrix = TRUE, printDT=TRUE)
{
  #"Need to be the same factors" - Make sure to designate categorical=false if the targ123 is continuous
  # There are two trees - the tree from caret (train(formula, ...)) is what the rmse is based on.  
  # The other tree is not - it is also the one influenced by the number of levels.This is the 'fancy tree.'
  # I believe the fancy tree is also the one with all the stats.

    targ123 = target
    df4 <- df
  
  if (categorical) {
    df4[, targ123] <- as.factor(df4[, targ123])
  } 
  
  fla <- substitute(n ~ ., list(n = as.name(targ123)))
  
  set.seed(seed)
  
  i <- createDataPartition(unlist(df4[,targ123]), p=0.8, list=FALSE)
  
  dfEval <- df4[-i,]
  dfTrain <- df4[i,]
  
count(dfTrain[targ123])
  
  tc <- trainControl(method="cv", number=10)
  metric <- "Accuracy"
  
  
  library(rpart)
  
  levels2 = levels-1
  output.tree <- rpart(fla, data = dfTrain, control = rpart.control(maxdepth = levels2))
  
  
  library(rpart.plot)
  
  library(RColorBrewer)
  
  library(rattle)
  if(printFancyTree){
  fancyRpartPlot(output.tree)
  }
  
  Formula  = reformulate(".",response=targ123)
  dt <- train(Formula, data=dfTrain, method="rpart")
  
  if (printDT) {
    library(rpart.plot)
    rpart.plot(dt$finalModel)
  }  
  
  predictions <- predict(dt, dfEval)
  dfPred <- as.data.frame(predictions)
  
  if (categorical) {
    x <- factor(dfEval[, targ123])
    y <- confusionMatrix(predictions, x) 
    if(printConfusionMatrix) {
    print(y)
    }
  } else {
    
    #load Metrics package
    library(Metrics)
    rmseval <- rmse(dfEval[,targ123], dfPred$predictions)
    print(paste('Decision tree - RMSE on evaluation set: ', rmseval))
  }

  
  x <- as.data.frame(cbind(dfEval[,targ123], dfPred))
  
  x1 <- x %>%
    dplyr::rename("observeds" = 1) %>%
    mutate(observeds = as.double(observeds)) %>%
    mutate(predictions = as.double(predictions)) %>%
    mutate(residuals = observeds - predictions)
  
  
  newList <- list("dt" = dt, "errors" = x1)
  return(newList)

}

EHModel_RandomForest <- function(df4, target, seed=042760, categorical=TRUE, printRF = TRUE, printVarimp=TRUE, printPlot=TRUE, printConfusionMatrix=TRUE)
{
  
  #"Need to be the same factors" - Make sure to designate categorical=false if the targ123 is continuous
  #'Error in confusionMatrix.default(predictions, x) : the data cannot have more levels than the reference - This occured when the target was continuous
  
  targ123 <- target

  if (categorical) {
    df4[, targ123] <- as.factor(df4[, targ123])
  } 
  
  set.seed(seed)
  
  i <- createDataPartition(unlist(df4[,targ123]), p=0.8, list=FALSE)
  
  dfEval <- df4[-i,]
  dfTrain <- df4[i,]
  
  count(dfTrain[targ123])
  
  tc <- trainControl(method="cv", number=10)
  metric <- "Accuracy"
  
  
  Formula  = reformulate(".",response=targ123)
  rf <- train(Formula, data=dfTrain, method="rf", trControl = tc)
  
  if (printRF){
    print(rf)
  }

  if (printPlot){
    print(plot(rf))
  }
  if (printVarimp){
    print(varImp(rf))
  }
  
  
  predictions <- predict(rf, dfEval)
  dfPred <- as.data.frame(predictions)
  
  if (categorical) {
    x <- factor(dfEval[, targ123])
    y <- confusionMatrix(predictions, x) 
    if (printConfusionMatrix){
    print(y)
    }
  } else {
    
    library(Metrics)
    rmseval <- rmse(dfEval[,targ123], dfPred$predictions)
    print(paste('Random Forest - RMSE on evaluation set: ', rmseval))
  }
  
  print(paste("Parameters:   mtry = ", rf$finalModel$mtry, ", ntree = ", rf$finalModel$ntree, ", nrnodes = ", rf$finalModel$forest$nrnodes))

  x <- as.data.frame(cbind(dfEval[,targ123], dfPred))
  
  x1 <- x %>%
  dplyr::rename("observeds" = 1) %>%
  dplyr::mutate(observeds = as.double(observeds)) %>%
  dplyr::mutate(predictions = as.double(predictions)) %>%
  dplyr::mutate(residuals = observeds - predictions)

  
  newList <- list("rf" = rf, "errors" = x1)
  return(newList)
  
}

EHModel_SVM_ToReplace <- function(df4, target, method = "linear", seed=042760, printSVM = TRUE, printPlot=FALSE, printConfusionMatrix =TRUE, cValue=0, sigmaValue=0)
{
  #PROBLEM- formula (y ~ ) and a df takes 100 times longer than an x df and a y df!! Need to change. 
  
  #Scaling is done as part of pre-processing in train, so need not be done by hand.
  #For linear, c is tuned by the grid: expand.grid(C = seq(0.01, 2, length = 20).  For radial and poly, sigma and c are optimized automatically, UNLESS YOU SPECIFY BOTH (WOULD BE BETTER IN A LIST)
  
  targ123 <- target
  
    df4[, targ123] <- as.factor(df4[, targ123])

  set.seed(seed)
  
  i <- createDataPartition(unlist(df4[,targ123]), p=0.8, list=FALSE)
  
  dfEval <- df4[-i,]
  dfTrain <- df4[i,]
  
  count(dfTrain[targ123])
  
  tc <- trainControl(method="repeatedcv", number=10, repeats=3)
  metric <- "Accuracy"
  
  library("stringi")     
  method1 <- stri_trans_totitle(method)
  method2 <- paste0("svm", method1)
  
  Formula  = reformulate(".",response=targ123)
  
  if (method1 == "Linear") {
        svm <- train(Formula, data=dfTrain, method=method2, trControl = tc, preProcess = c("center","scale"), tuneGrid = expand.grid(C = seq(0.01, 2, length = 20)))
  } else if (method1=="Radial"|method1=="Poly") {
      if (cValue!=0 && sigmaValue!=0) {
        svm <- train(Formula, data=dfTrain, method=method2, trControl = tc, preProcess = c("center","scale"), tuneGrid = expand.grid(C = cValue, sigma=sigmaValue))
      } else {
        svm <- train(Formula, data=dfTrain, method=method2, trControl=tc, preProcess = c("center","scale")) 
      } 
  } else {
    print("Unkown kernel. The choices are linear, radial or poly.")
    retun()
  }
    
  
  if (printSVM){
    print(svm)
  }
  
  if (printPlot){
    print(plot(svm))
  }
 
  
  predictions <- predict(svm, dfEval)
  dfPred <- as.data.frame(predictions)
  
    x <- factor(dfEval[, targ123])
    y <- confusionMatrix(predictions, x) 
    
    if (printConfusionMatrix){
      print(y)
    }
  
  #print(paste("Parameters:   mtry = ", rf$finalModel$mtry, ", ntree = ", rf$finalModel$ntree, ", nrnodes = ", rf$finalModel$forest$nrnodes))
  
    x <- as.data.frame(cbind(dfEval[,targ123], dfPred))
  
  x1 <- x %>%
    dplyr::rename("observeds" = 1) %>%
    mutate(observeds = as.double(observeds)) %>%
    mutate(predictions = as.double(predictions)) %>%
    mutate(residuals = observeds - predictions)
  
  
  newList <- list("svm" = svm, "errors" = x1)
  return(newList)
  
}

EHModel_SVM <- function(df4, target, method = "linear", seed=042760, printSVM = TRUE, printPlot=FALSE, printConfusionMatrix =TRUE, cValue=0, sigmaValue=0)
{
  
  Print ("Use the 'TOReplace' one - this doesn't work right - it can't make predictions")
  #PROBLEM- formula (y ~ ) and a df takes 100 times longer than an x df and a y df!! Need to change. 
  
  #Scaling is done as part of pre-processing in train, so need not be done by hand.
  #For linear, c is tuned by the grid: expand.grid(C = seq(0.01, 2, length = 20).  For radial and poly, sigma and c are optimized automatically, UNLESS YOU SPECIFY BOTH (WOULD BE BETTER IN A LIST)
  
  #"Error: `data` and `reference` should be factors with the same levels." may mean that the model cannot generate predicitons.  

  targ123 <- target
  
  df4[, targ123] <- as.factor(df4[, targ123])
  
  set.seed(seed)
  
  i <- createDataPartition(unlist(df4[,targ123]), p=0.8, list=FALSE)
  
  dfEval <- df4[-i,]
  dfTrain <- df4[i,]
  
  count(dfTrain[targ123])
  
  tc <- trainControl(method="repeatedcv", number=10, repeats=3)
  metric <- "Accuracy"
  
  library("stringi")     
  method1 <- stri_trans_totitle(method)
  method2 <- paste0("svm", method1)
  
  xdf <- dfTrain %>%
    dplyr::select(-targ123)
  ydf <- as.numeric(dfTrain[,targ123])
  
  if (method1 == "Linear") {
    svm <- train(xdf,ydf, method=method2, trControl = tc, preProcess = c("center","scale"), tuneGrid = expand.grid(C = seq(0.01, 2, length = 20)))
  } else if (method1=="Radial"|method1=="Poly") {
    if (cValue!=0 && sigmaValue!=0) {
      svm <- train(xdf,ydf, method=method2, trControl = tc, preProcess = c("center","scale"), tuneGrid = expand.grid(C = cValue, sigma=sigmaValue))
    } else {
      svm <- train(xdf,ydf, method=method2, trControl=tc, preProcess = c("center","scale")) 
    } 
  } else {
    print("Unkown kernel. The choices are linear, radial or poly.")
    retun()
  }
  
  
  if (printSVM){
    print(svm)
  }
  
  if (printPlot){
    print(plot(svm))
  }
  
  
  predictions <- predict(svm, dfEval)
  dfPred <- as.data.frame(predictions)
  
  x <- factor(dfEval[, targ123])
  
  y <- confusionMatrix(predictions, x) 
  
  if (printConfusionMatrix){
    print(y)
  }
  
  #print(paste("Parameters:   mtry = ", rf$finalModel$mtry, ", ntree = ", rf$finalModel$ntree, ", nrnodes = ", rf$finalModel$forest$nrnodes))
  
  x <- as.data.frame(cbind(dfEval[,targ123], dfPred))
  
  x1 <- x %>%
    dplyr::rename("observeds" = 1) %>%
    mutate(observeds = as.double(observeds)) %>%
    mutate(predictions = as.double(predictions)) %>%
    mutate(residuals = observeds - predictions)
  
  
  newList <- list("svm" = svm, "errors" = x1)
  return(newList)
  
}

EHCalculate_AUC_ForBinaryClasses <- function(dfPredictions, printPlot=TRUE, printConfusionMatrix=FALSE)
{
  
  #Observed come first, then Predictions!
  
  library(caTools)
  library(ROCR)
  
  dfPred <- dfPredictions %>%
    dplyr::rename("obs1"=1, "pred1"=2) %>%
    dplyr::select(obs1, pred1)
  

  dfPred1 <- dfPred
  dfPred1$obs1a <- as.factor(dfPred1$obs1)
  dfPred1$pred1a <- as.factor(dfPred1$pred1)
  q <-confusionMatrix(data = dfPred1$pred1a, reference = dfPred1$obs1a)
  
  if (printConfusionMatrix){
  print(q)
  }
  
  roc1 <- roc(dfPred$obs1,
              dfPred$pred1, plot=printPlot)
  xauc <- roc1$auc
  
  newList <- list("AUC" = xauc, "ConfusionMatrix" = q)
  return(newList)
  
}


EHModel_Predict <- function(model, dftestData, testData_IDColumn, predictionsColumnName ="Predictions", threshold=0, writeFile="")
{
  
  predictions <- predict(model,newdata=dftestData)
  predictions <- data.frame(as.vector(predictions)) 
  predictions[, testData_IDColumn] <- dftestData[, testData_IDColumn]
  predictions[,c(1,2)] <- predictions[,c(2,1)]
  colnames(predictions) <- c(testData_IDColumn, predictionsColumnName)
  
  if (threshold>0){
  predictions[, predictionsColumnName] <- ifelse(predictions[, predictionsColumnName]>threshold,1,0)
  }
  
  if (writeFile!="") {
    write_csv(predictions, writeFile) 
  }
  
  return(predictions)
}


EHPrepare_RemoveRecordsByRowNumber <- function(df, num)
{

  #num can be a single number or a c() of numbers
  df <- df[-c(num), ]
  return (df)
  
}

EHPrepare_RemoveColumnsWithAllNA <- function(df)
{
  
  #num can be a single number or a c() of numbers
  df <- df |>
    dplyr::select(where(~!all(is.na(.x))))
  return (df)
  
}

  EHSummarize_OverallColumnDescription <- function(df, summary=TRUE, str=FALSE, glimpse=TRUE, describe=FALSE)
  {
    library(Hmisc)
    
    if(summary)
    {
      print(summary(df))
    }
    
    if(str)
    {
      print(str(df))
    }
    
    if(glimpse)
    {
      print(glimpse(df))
    }
    
    if(describe)
    {
      print(describe(df))
    }
    
  }
ericonsi/EHData documentation built on April 12, 2025, 7:39 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
ericonsi/EHData
Stats, Data Wrangling, Connection and Plotting Libraries by Eric Hirsch

R/EHDataR.R
In ericonsi/EHData: Stats, Data Wrangling, Connection and Plotting Libraries by Eric Hirsch

R Package Documentation

Browse R Packages

We want your feedback!

ericonsi/EHData Stats, Data Wrangling, Connection and Plotting Libraries by Eric Hirsch

R/EHDataR.R In ericonsi/EHData: Stats, Data Wrangling, Connection and Plotting Libraries by Eric Hirsch

R Package Documentation

Browse R Packages

We want your feedback!

ericonsi/EHData
Stats, Data Wrangling, Connection and Plotting Libraries by Eric Hirsch

R/EHDataR.R
In ericonsi/EHData: Stats, Data Wrangling, Connection and Plotting Libraries by Eric Hirsch