#' visualizeR - Automated exploratory data analysis for classification problems
#'
#' visualizeR automates exploratory data analysis for classification problems in machine learning. The problem can be two-class or multi-class classification. It is recommended that all ID and Date features be removed before running this algorithm, cleaning the data before running this is also recommended. visualizeR has some data cleaning aspects built into it but cannot account for domain knowledge cleaning.
#' @param df A data.frame object containing plotting features and target/outcome feature. Cannot be left blank.
#' @param Outcome The feature name of the outcome as character format, e.g. 'Target'. Cannot be left blank.
#' @param nrBins The number of bins to use in histogram plots of numerical features should 'stackedHist' be used as the chart type in the parameter 'NumChartType'.
#' @param sample Should a random sample be taken in order to speed the plotting process up.
#' @param clipOutliers Should outliers be fixed in the data using a median approach. Possible values: TRUE,FALSE
#' @param handleMissing Should missing values be corrected with 'Missing' value for categorical variables and median imputation for conitnuous variables. Possible values: TRUE,FALSE. Should this be left as 'N' then missing observations will be removed from the plots.
#' @param CatChartType Indicates the type of chart to use when plotting categorical/factor features. Possible values: 'stackedHist', 'Confusion'
#' @param NumChartType Indicates the type of chart to use when plotting numerical/continuous features. Possible values: 'stackedHist', 'densityLine', 'densityFill', 'boxPlot'
#' @param summaryStats Should summary statistics be printed for predictors in the dataset, summary stats for continuous and frequency tables for categorical variables. Possible values: TRUE,FALSE
#' @param seed Used only for the sampling of the data and to reproduce the plots.
#' @param maxLevels The maximum levels allowed for factor features, if a feature has levels more than the threshold it will not be plotted.
#' @param nrUniques The number of allowed unique values for a feature before it is automatically changed to a categorical feature. If a feature has less than this threshold, the feature will be changed to a categorical feature.
#' @param ouputPath A file path where the plots should be saved in a PDF document. If left blank all plots will be displayed in R.
#' @param outputFileName The name of the file containing all the plots.
#' @keywords visualizeR
#' @export
#' @examples
#' EXAMPLE 1:
#' library(datasets)
#' train <- data.frame(iris)
#'visualizeR(df = train,
#' Outcome = 'Species',
#' nrBins = 30,
#' sample = 1,
#' clipOutliers = 'Y',
#' CatChartType = 'stackedHist',
#' NumChartType = 'boxPlot')
#'
#'EXAMPLE 2:
#'visualizeR(df = train,
#'Outcome = 'Species',
#'nrBins = 30,
#'sample = 1,
#'clipOutliers = 'Y',
#'CatChartType = 'Confusion',
#'NumChartType = 'stackedHist',
#'summaryStats = 'Y',
#'outputPath = 'C:/Users/User/Documents',
#'outputFileName = 'IrisExploratoryDataAnalysis')
#' @author
#' Xander Horn
visualizeR <- function(df,
Outcome,
nrBins = 30,
sample = 0.3,
clipOutliers = TRUE,
handleMissing = TRUE,
CatChartType = 'stackedHist',
NumChartType = 'boxPlot',
summaryStats = FALSE,
seed = 1234,
maxLevels = 25,
nrUniques = 20,
outputPath = '',
outputFileName = 'outputPlots'){
if(missing(df)){
print("Please supply a data.frame object to visualizeR")
} else if(missing(Outcome)){
print("Please indicate which feature is the Outcome/Target/Label feature as type character in the format 'SomeTargetFeature'.")
} else {
#**************************************************************************************************
#START FUNCTION
#**************************************************************************************************
library(pacman)
p_load(dplyr,ggplot2,knitr,viridis,sqldf)
options(scipen=999)
print("visualizeR | 1. Partial Data Cleaning")
print("MISSING VALUES ARE ENCODED AS 'Missing' FOR CATEGORICAL AND MEDIAN IMPUTATION IS USED FOR NUMERIC")
#IF GRAPHS SHOULD BE OUTPUTTED TO A PDF FILE
if(outputPath != ''){
PDFPath = paste(outputPath,'/',outputFileName,'.pdf',sep='')
pdf(file=PDFPath)
}
#CHANGE OUTCOME FEATURE TO FACTOR FORMAT
df[,Outcome] <- as.factor(df[,Outcome])
#SAMPLING
set.seed(seed)
ind <- sample(nrow(df),sample*nrow(df),replace = F)
df <- df[ind,]
#REMOVE CATEGORICAL FEATURES WITH MORE THAN x LEVELS
remove <- length(ncol(df))
removeInd <- length(ncol(df))
remove <- NA
removeInd <- NA
for(i in 1:ncol(df)){
#CONVERT TO FACTOR FEATURES
if(class(df[,i]) %in% c('factor','character')){
df[,i] <- as.factor(toupper(as.character(df[,i])))
} else {
df[,i] <- as.numeric(df[,i])
}
if(class(df[,i]) == 'factor'){
ind <- length(levels(df[,i])) > maxLevels
removeInd[i] <- ifelse(ind ==T,i,NA)
remove[i] <- ifelse(ind ==T,names(df)[i],NA)
} else if(class(df[,i]) == 'numeric' & length(unique(df[,i])) < nrUniques){
print(paste("CHANGED FEATURE: ",names(df)[i]," TO A FACTOR FEATURE DUE TO LOW UNIQUE VALUES",sep=''))
df[,i] <- as.factor(df[,i])
}
}
if(sum(is.na(removeInd)) != length(removeInd)){
remove <- remove[!is.na(remove)]
removeInd <- removeInd[!is.na(removeInd)]
for(i in 1:length(remove)){
print(paste("REMOVED FEATURE: ",remove[i]," , HAS TOO MANY LEVELS TO PLOT",sep=''))
}
df <- df[,-removeInd]
}
print("visualizeR | 2. Feature Plotting")
#CLIP OUTLIERS
for(i in 1:ncol(df)){
print(paste("Plotting Feature",names(df)[i],",",i," Of ",ncol(df),": Missing Observations = ",sum(is.na(df[,i]))))
if(toupper(clipOutliers) == TRUE & class(df[,i]) == 'numeric'){
feature <- names(df)[i]
threshold <- quantile(df[,feature],0.99,na.rm = T)
threshold2 <- quantile(df[,feature],0.01,na.rm = T)
outliers <- which(df[,feature] >= threshold)
outliers2 <- which(df[,feature] <= threshold2)
median <- round(median(df[c(-outliers,-outliers2),feature],na.rm=T))
df[outliers,feature] <- median
df[outliers2,feature] <- median
}
#CATEGORICAL FEATURES
if(class(df[,i]) != 'numeric'){
if(handleMissing == TRUE){
df[,i] <- as.factor(ifelse(is.na(as.character(df[,i])) == T,'Missing',as.character(df[,i])))
}
#STACKED HISTOGRAM
if(CatChartType == 'stackedHist'){
if(length(levels(df[,i])) <= 2){
vizCat <- df %>%
ggplot(aes(x = df[,Outcome], fill = df[,i])) +
geom_bar(width = 0.6, position = "fill") +
scale_fill_viridis(name = paste(names(df)[i],": Levels",sep=''), discrete = T, begin = 0.5,end = 0.9) +
ggtitle(paste("Outcome By ",names(df)[i],sep='')) +
labs(x = "Outcome", y ="Percentage Freq")
} else {
vizCat <- df %>%
ggplot(aes(x = df[,Outcome], fill = df[,i])) +
geom_bar(width = 0.6, position = "fill") +
scale_fill_viridis(name = paste(names(df)[i],": Levels",sep=''), discrete = T) +
ggtitle(paste("Outcome By ",names(df)[i],sep='')) +
labs(x = "Outcome", y ="Percentage Freq")
}
#CONFUSION PLOT
} else if(CatChartType == 'Confusion'){
vizCat <- ggplot(data=df,aes(df[,i],df[,Outcome]))+
geom_bin2d(bins=nrBins)+
theme(axis.text.x = element_text(angle=90))+
#scale_fill_viridis(name = "Outcome Levels", discrete = F) +
xlab(names(df)[i])+
ylab('Outcome')+
ggtitle(paste("Outcome Vs ",names(df)[i],sep=''))
}
#CONTINEOUS FEATURES
} else {
if(handleMissing == TRUE){
df[,i] <- ifelse(is.na(df[,i]) == T,median(df[,i],na.rm = T),df[,i])
}
#HISTOGRAM CHART TYPE
if(NumChartType == 'stackedHist'){
vizNum <- ggplot(data = df, aes(x = df[,i],fill = df[,Outcome], colour = df[,Outcome])) +
geom_histogram(bins = nrBins, alpha = 0.8) +
ggtitle(paste("Distribution By Outcome",sep='')) +
labs(x = names(df)[i], y ="Freq") +
guides(fill = guide_legend(reverse = TRUE), colour = guide_legend(reverse = TRUE))
#DENSITY LINE CHART TYPE
} else if(NumChartType == 'densityLine') {
vizNum <- qplot(df[,i], data=df, geom="density", colour=df[,Outcome],size=I(1),
main="Density Distribution By Outcome", xlab=paste(names(df)[i]),
ylab="Density")
#DENSITY FILL CHART TYPE
} else if(NumChartType == 'densityFill'){
vizNum <- ggplot(df, aes(x = df[,i], fill = df[,Outcome])) +
geom_density(alpha = 0.5) +
ggtitle(paste("Density Distribution By Outcome",sep='')) +
labs(x = names(df)[i], y ="Density") +
guides(fill = guide_legend(reverse = TRUE), colour = guide_legend(reverse = TRUE))
#BOX PLOTS
} else if(NumChartType == 'boxPlot'){
vizNum <- ggplot(df, aes(x = df[, Outcome],
y = df[, i], color = df[, Outcome])) +
geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE) +
ggtitle(paste("Outcome By ", names(df)[i],
sep = "")) + labs(x = "Outcome", y = names(df)[i])
}
}
viz <- list()
if(names(df)[i] == Outcome){
viz <- qplot(df[,Outcome], data = df, fill=df[,Outcome],color = df[,Outcome], main = "Outcome Distribution") +
labs(x = "Outcome", y ="Freq")
} else if(class(df[,i]) == 'factor'){
viz <- vizCat
} else {
viz <- vizNum
}
print(viz)
}
if(toupper(summaryStats) == TRUE){
print("visualizeR | 3. Summary Statistics")
for(i in 1:ncol(df)){
if(class(df[,i]) == 'numeric'){
print(paste('Feature :',names(df)[i],sep=''))
print(summary(df[,i]))
} else {
print(paste('Feature: ',names(df)[i],sep=''))
print(prop.table(table(df[,i])))
}
}
}
if(outputPath != ''){
dev.off()
}
invisible(gc)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.