# author Group 4 Isaac Newton
#' Generates an EDA report by plotting graphs and tables for the numeric variables, categorical variables, NA values and correlation in a dataframe
#'
#' @param dataframe tbl. The dataframe to be inspected.
#' @param cat_vars vector of character strings of the names of the categorical variables.
#' @param num_vars vector of character strings of the names of the numeric variables.
#'
#' @return It returns True on successful execution else returns False
#'
#' @export
#'
#' @examples
#' X <- dplyr::tibble(type = c('Car','Bus','Car'), height = c(10,20,30))
#' cat_vars <- c('type')
#' num_vars <- c('height')
#' generate_report(X, cat_vars, num_vars)
#'
generate_report <- function(dataframe, cat_vars, num_vars) {
try({
fun1 <- describe_num_var(dataframe, num_vars)
fun2 <- describe_cat_var(dataframe, cat_vars)
fun3 <- describe_na_values(dataframe)
fun4 <- calc_cor(dataframe, num_vars)
print(list(fun1, fun2, fun3, fun4))
return(TRUE)
})
return(FALSE)
}
#' Provides statistical summary of the numeric variables for a dataframe, such as the mean, median,
#' maximum and minimum for the numeric variables.
#'
#' @param dataframe tbl. The dataframe to be inspected.
#' @param num_vars vector of character strings of the names of the numeric variables the user want to explore.
#'
#' @return list. A list containing a dataframe with a statistical summary and a ggplot object with histograms of numeric variables faceted by each variable.
#'
#' @import tibble
#' @import dplyr
#' @import tidyr
#' @import ggplot2
#' @import rlang
#' @importFrom stats median quantile sd
#' @export
#'
#' @examples
#' X <- dplyr::tibble(type = c('Car', 'Bus', 'Car'), height = c(10, 20, 15), width = c(10, 15, 13))
#' num_vars <- c('height', 'width')
#' result <- describe_num_var(X, num_vars)
#' result$summary
#' result$plot
#'
describe_num_var <- function(dataframe, num_vars) {
# Check the input dataframe is a dataframe
if (!is.data.frame(dataframe)) {
stop("The value of the argument 'dataframe' should be of type 'data.frame' or 'tibble'.")
}
# Transform the input into tibble
dataframe <- as_tibble(dataframe)
# Chech the input num_vars is a vector of characters
if (!is.character(num_vars) | !is.vector(num_vars)) {
stop("The value of the argument 'num_vars' should be a vector of characters.")
}
# Check the input num_vars is a vector of column names of dataframe
if (!all(num_vars %in% colnames(dataframe))) {
stop("The argument 'num_vars' should be a subset of the column names of the dataframe.")
}
# Select the numeric variables to work with
df <- dataframe %>%
select(num_vars)
# Check if only the numeric variables are selected
if (!all(vapply(df, is.numeric, FUN.VALUE = logical(1)))) {
stop("Only numeric columns expected, please check the input.")
}
# Calculate the statistical summaries for all columns
stat_funs <- c(min, max, median, mean, sd)
temp <- c()
for (item in stat_funs) {
data_stat <- vapply(df, item, na.rm = TRUE, FUN.VALUE = numeric(1))
temp <- rbind(temp, data_stat)
}
data_quantile <- vapply(df, quantile, na.rm = TRUE, FUN.VALUE = numeric(5))
# Get the 25% and 75% quantiles
quantile_025 <- data_quantile[2, ]
quantile_075 <- data_quantile[4, ]
# Give the summaries meaningful names and make the result as a tibble
summary <- c("25%", "75%", "min", "max", "median", "mean", "sd")
summary_wo_name <- round(rbind(quantile_025, quantile_075, temp),3)
result <- as_tibble(cbind(summary, summary_wo_name))
# Plot the faceted histogram
data_plot <- df %>%
drop_na() %>%
gather() %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30, color = 'gray') +
facet_wrap(~ key, scales = "free", ncol = 3)
return(list(summary = result, plot = data_plot))
}
#' Finds the frequency of the categorical variables in a data frame and returns the histogram of each categorical variable.
#'
#' @param dataframe tbl. The dataframe to be inspected.
#' @param cat_vars vector of character strings of the names of the categorical variables.
#'
#' @return ggplot object to plot histogram of the categorical variables
#'
#' @export
#'
#' @examples
#' X <- dplyr::tibble(type = c('Car','Bus','Car'), height = c(10,20,30))
#' cat_vars <- c('type')
#' describe_cat_var(X, cat_vars)
#'
describe_cat_var <- function(dataframe, cat_vars) {
# Check the input dataframe is a dataframe
if (!is.data.frame(dataframe)) {
stop("The value of the argument 'dataframe' should be of type 'data.frame' or 'tibble'.")
}
# Transform the input into tibble
dataframe <- as_tibble(dataframe)
# Chech the input cat_vars is a vector of characters
if (!is.character(cat_vars) | !is.vector(cat_vars)) {
stop("The value of the argument 'cat_vars' should be a vector of characters.")
}
# Check the input cat_vars is a vector of column names of dataframe
if (!all(cat_vars %in% colnames(dataframe))) {
stop("The argument 'cat_vars' should be a subset of the column names of the dataframe.")
}
# Select the categorical variables to work with
df <- dataframe %>%
select(cat_vars)
# Plot the faceted histogram
data_plot <- df %>%
drop_na() %>%
gather() %>%
ggplot(aes(x = value)) +
geom_bar(aes(y=..count..), color = 'gray',width = 0.6) +
facet_wrap(~ key, scales = "free", ncol = 3)+
theme(axis.text.x = element_text(angle = 90))+
ggtitle('Histogram of categorical variables')
return(data_plot)
}
#' Evaluates the correlation between the numeric columns of a given dataframe.
#'
#' @param df The dataframe to be inspected.
#' @param num_vars A list of strings of column names containing numeric variables.
#'
#' @return ggplot object; a correlation matrix plot labelled with the correlation coefficients of -1 to 1 between each numeric column and other numeric columns in the dataframe.
#'
#' @export
#'
#' @examples
#' df <- data.frame(x = (c(2,3,4)), y= c(1,10,3))
#' col_num <- c("x", "y")
#' calc_cor(df, col_num)
#'
calc_cor <- function(df, num_vars) {
# Check the input num_vars is a vector of characters
if (!is.character(num_vars) | !is.vector(num_vars)) {
stop("The value of the argument 'num_vars' should be a vector of characters.")
}
# Test dataframe input to check if dataframe is a dataframe
if (!is.data.frame(df))
stop("Input 'df' should be a dataframe.")
# Check the input dataframe is a dataframe
if (!is.data.frame(df)) {
stop("The value of the argument 'dataframe' should be of type 'data.frame' or 'tibble'.")
}
# Check the input num_vars is a vector of characters
if (!is.character(num_vars) | !is.vector(num_vars)) {
stop("The value of the argument 'num_vars' should be a vector of characters.")
}
# Check the input num_vars is a vector of column names of dataframe
if (!all(num_vars %in% colnames(df))) {
stop("The argument 'num_vars' should be a subset of the column names of the dataframe.")
}
# Find numerical columns and remove NA
df_num <- df[, num_vars]
df_num <- tidyr::drop_na(df_num)
# Test colums to check if columns provided are numeric
if (!all(sapply(df_num, is.numeric)))
stop("Columns do not all contain numeric values.")
# Find the correlation
df_cor <- round(stats::cor(df_num), 2)
# Plot the correlation matrix
corr_plot <- ggcorrplot::ggcorrplot(df_cor,
type = "lower",
outline.color = "white",
lab = TRUE,
title="Correlation Matrix",
colors = c("blue", "white", "red"),
lab_size = 8) +
ggplot2::theme(title=ggplot2::element_text(size=20),
axis.text=ggplot2::element_text(size=15, face='bold'),
text=ggplot2::element_text(size=15))
return(corr_plot)
}
#' Evaluates a dataframe for NA values.
#'
#' @param dataframe the dataframe to be inspected.
#'
#' @return a tibble; each column corresponds to the same column in dataframe and each value inside the column is 0 if the corresponding value is NA, 1 otherwise.
#' stops if the object passed in is not a data.frame or tibble.
#'
#' @export
#'
#' @examples
#' df <- data.frame(x = (c(2,3,4)), y= c(1,10,3))
#' col_num <- describe_na_values(df)
#'
#' #> # A tibble: 2 x 3
#' #> x y
#' #> <int> <int>
#' #> 1 1 1
#' #> 2 1 1
#' #> 3 1 1
#'
#' df <- data.frame(x = (c(2,NaN,4)), y= c(1,10,3))
#' col_num <- describe_na_values(df)
#'
#' #> # A tibble: 2 x 3
#' #> x y
#' #> <int> <int>
#' #> 1 1 1
#' #> 2 0 1
#' #> 3 1 1
#'
describe_na_values <- function(dataframe) {
if (!is.data.frame(dataframe)) {
stop("The value of the argument 'dataframe' should be of type 'data.frame' or 'tibble'.")
}
ret_dataframe <-purrr::map(dataframe, function(x){as.numeric(!is.na(x))})
ret_dataframe
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.