#' @title Generate Descriptive Statistics for a given dataset
#'
#' @description Generates a variety of descriptive statistics for either the categorical variables or the numeric variables of a given dataset.
#' Most notably for numeric variables; mean, median, variance, standard deviation, min, max, range, 1st quarile, 3rd quartile, inter quartile range, skeweness, kurtosis, Complete Observations, NA count and NA percentage.
#' Note, all the descriptive statistics are calculated with any NA values removed.
#' Most notably for categorical variables; number of levels, 1st and 2nd modes, count of the 1st and 2nd modes, percentage of the 1st and 2nd modes, amount of NA's and the percentage of NA's.
#'
#' @param dataset A dataset from which the descriptive statistics are derived.
#'
#' @param type A character object specfying the type of descriptive statistics to be derived.
#' One of three; either "numeric", "factor", "character".
#' Default is numeric
#'
#' @param file_name A character object indicating the file name when saving the data frame.
#' The default is NULL.
#' The name must include the .csv suffixs.
#'
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#'.
#' @return Outputs the descriptive statistics as a data frame.
#'
#' @import moments
#'
#' @export
#'
#' @seealso
#'
#' @examples
#' # Example Data
#' x1 <- rnorm(n = 60, mean = 50, sd = 10)
#' x2 <- rpois(n = 60, lambda = 50)
#' x3 <- sample(x = 1:10, size = 60, replace = TRUE)
#' x4 <- rep(x = c("yes", "no"), times = 30)
#' x5 <- rep(x = c("high", "medium", "low"), times = 20)
#' x6 <- sample(x = c("yes", "no"), size = 60, replace = TRUE)
#' # Save as a data frame
#' data <- as.data.frame(cbind(x1, x2, x3, x4, x5, x6))
#' # Derive descriptive statistics
#' descriptive_statistics(dataset = data, type = "numeric")
#' descriptive_statistics(dataset = data, type = "factor")
#'
descriptive_statistics <- function(dataset,
type = c("numeric", "factor", "character"),
file_name = NULL,
directory = NULL)
{
#-------------------------------------------------------------------------------#
# If Dataset is a Data Frame #
#-------------------------------------------------------------------------------#
if(is.data.frame(dataset)){
# Make sure the datset is converted to a data frame
dataset <- as.data.frame(dataset)
# Confirm correct choice for function
type <- match.arg(type)
if(type == "factor"){
# row index for categorical variables
r = 1
# create the data frame to hold the categorical descriptive statistics
# calculate the number of number factor variables in the dataset
factor_num_row <- sum(sapply(X = dataset, FUN = function(x) is.factor(x)))
factor_descriptive_statistics <- as.data.frame(matrix(nrow = factor_num_row, ncol = 11))
# assign the column names
colnames(factor_descriptive_statistics) <- c("Xi", "nlevels",
"1st mode", "1st mode cnt", "1st mode %",
"2nd mode", "2nd mode cnt", "2nd mode %",
"Complete Obs.", "NA cnt", "NA %")
for (i in 1:ncol(dataset)) {
if (is.factor(dataset[,i])){
# first write in the row name i.e. the attribute name
factor_descriptive_statistics[r, 1] <- colnames(dataset)[i]
# next compute the descriptive statistics for each cell in the row
# (1) Number of Levels
factor_descriptive_statistics[r, 2] <- nlevels(dataset[, i])
# (2) 1st Mode
factor_descriptive_statistics[r, 3] <- names(which.max(summary(dataset[,i])))
# (3) 1st Mode Count
factor_descriptive_statistics[r, 4] <- summary(dataset[, i])[which.max(summary(dataset[,i]))]
# (4) 1st Mode Percentage
factor_descriptive_statistics[r, 5] <- round(summary(dataset[, i])[which.max(summary(dataset[,i]))] * 100 / sum(summary(dataset[,i])),
digits = 4)
# (5) 2nd Mode
factor_descriptive_statistics[r, 6] <- names(sort(summary(dataset[, i]), decreasing = TRUE)[2])
# (6) 2nd Mode Count
factor_descriptive_statistics[r, 7] <- sort(summary(dataset[, i]), decreasing = TRUE)[2]
# (7) 2nd Mode Percentage
factor_descriptive_statistics[r, 8] <- round(sort(summary(dataset[, i]), decreasing = TRUE)[2] * 100/ sum(summary(dataset[,i])),
digits = 4)
# (8) Complete Observations
factor_descriptive_statistics[r, 9] <- sum(complete.cases(dataset[, i]))
# (9) NA Count
factor_descriptive_statistics[r, 10] <- sum(is.na(dataset[,i]))
#(10) NA Precentage
factor_descriptive_statistics[r, 11] <- round(((sum(is.na(dataset[,i])) / (nrow(dataset))) * 100),
digits = 4)
# Update row index
r = r + 1
}
}
if(!is.null(directory)) {
write.csv(x = factor_descriptive_statistics,
file = paste(directory, "/", file_name, sep = ""),
row.names = F)
}
# Return the dataframe of descriptive statistics
return(factor_descriptive_statistics)
} else if(type == "numeric"){
k = 1 # row index for numeric variables
# create the data frame to hold the numeric descriptive statistics
numeric_num_col <- 16
numeric_num_row <- sum(sapply(X = dataset, FUN = function(x) is.numeric(x)))
numeric_descriptive_statistics <- as.data.frame(matrix(nrow = numeric_num_row, ncol = numeric_num_col))
colnames(numeric_descriptive_statistics) <- c("Xi", "Mean", "Median", "Var", "SD",
"min", "max", "range",
"1st Qrt", "3rd Qrt", "IQR",
"Skewness", "Kurtosis",
"Complete Obs.","NA cnt", "NA %")
for (i in 1:ncol(dataset)) {
if (is.numeric(dataset[,i])) {
# first write in the row name i.e. the aattribute name
numeric_descriptive_statistics[k, 1] <- colnames(dataset)[i]
# next compute the descriptive statistics for each cell in the row
# (1) Mean
numeric_descriptive_statistics[k, 2] <- round(mean(x = dataset[,i],
na.rm = TRUE),
digits = 4)
# (2) Median
numeric_descriptive_statistics[k, 3] <- round(median(x = dataset[,i],
na.rm = TRUE),
digits = 4)
# (3) Variance
numeric_descriptive_statistics[k, 4] <- round(var(x = dataset[,i],
na.rm = TRUE),
digits = 4)
# (4) Standard Deviation
numeric_descriptive_statistics[k, 5] <- round(sd(x = dataset[,i],
na.rm = TRUE),
digits = 4)
# (5) Minimum
numeric_descriptive_statistics[k, 6] <- round(min(x = dataset[,i],
na.rm = TRUE),
digits = 4)
# (6) Maximum
numeric_descriptive_statistics[k, 7] <- round(max(x = dataset[,i],
na.rm = TRUE),
digits = 4)
# (7) Range
numeric_descriptive_statistics[k, 8] <- round(max(x = dataset[,i],
na.rm = TRUE) -
min(dataset[,i],
na.rm = TRUE),
digits = 4)
# (8) 1st Quartile
numeric_descriptive_statistics[k, 9] <- round(quantile(x = dataset[,i],
probs = seq(from = 0,
to = 1,
by = 0.25),
na.rm = TRUE)[2],
digits = 4)
# (9) 3rd Quartile
numeric_descriptive_statistics[k, 10] <- round(quantile(x = dataset[,i],
probs = seq(from = 0,
to = 1,
by = 0.25),
na.rm = TRUE)[4],
digits = 4)
# (10) Interquartile Range
numeric_descriptive_statistics[k, 11] <- round(quantile(x = dataset[,i],
probs = seq(from = 0,
to = 1,
by = 0.25),
na.rm = TRUE)[4] -
quantile(x = dataset[,i],
probs = seq(from = 0,
to = 1,
by = 0.25),
na.rm = TRUE)[2],
digits = 4)
# (11) Skewness
numeric_descriptive_statistics[k, 12] <- round(skewness(x = dataset[,i],
na.rm = TRUE),
digits = 4)
# (12) Kurtosis
numeric_descriptive_statistics[k, 13] <- round(kurtosis(x = dataset[,i],
na.rm = TRUE),
digits = 4)
# (13) Complete Observations
numeric_descriptive_statistics[k, 14] <- sum(complete.cases(dataset[, i]))
# (14) NA Count
numeric_descriptive_statistics[k, 15] <- round(sum(is.na(dataset[,i])),
digits = 4)
# (15) NA Percentage
numeric_descriptive_statistics[k, 16] <- round(sum(is.na(dataset[,i])) * 100 / nrow(dataset),
digits = 4)
k = k + 1
}
}
if(!is.null(directory)) {
write.csv(x = numeric_descriptive_statistics,
file = paste(directory, "/", file_name, sep = ""),
row.names = F)
}
# Return the dataframe of descriptive statistics
return(numeric_descriptive_statistics)
}
#-------------------------------------------------------------------------------#
# If Dataset is a Vector #
#-------------------------------------------------------------------------------#
} else if(is.vector(dataset)){
if(is.factor(dataset)){
# create the data frame to hold the categorical descriptive statistics
factor_descriptive_statistics <- as.data.frame(matrix(nrow = 1,
ncol = 11))
# assign the column names
colnames(factor_descriptive_statistics) <- c("Xi", "nlevels",
"1st mode", "1st mode cnt", "1st mode %",
"2nd mode", "2nd mode cnt", "2nd mode %",
"Complete Obs.", "NA cnt", "NA %")
# first write in the row name i.e. the aattribute name
factor_descriptive_statistics[1, 1] <- deparse(substitute(dataset))
# next compute the descriptive statistics for each cell in the row
# (1) Number of Levels
factor_descriptive_statistics[1, 2] <- nlevels(dataset)
# (2) 1st Mode
factor_descriptive_statistics[1, 3] <- names(which.max(summary(dataset)))
# (3) 1st Mode Count
factor_descriptive_statistics[1, 4] <- summary(dataset)[which.max(summary(dataset))]
# (4) 1st Mode Percentage
factor_descriptive_statistics[1, 5] <- round((summary(dataset)[which.max(summary(dataset))] * 100) / sum(summary(dataset)),
digits = 4)
# (5) 2nd Mode
factor_descriptive_statistics[1, 6] <- names(sort(summary(dataset),
decreasing = TRUE)[2])
# (6) 2nd Mode Count
factor_descriptive_statistics[1, 7] <- sort(summary(dataset),
decreasing = TRUE)[2]
# (7) 2nd Mode Percentage
factor_descriptive_statistics[1, 8] <- round(sort(summary(dataset),
decreasing = TRUE)[2] * 100/ sum(summary(dataset)),
digits = 4)
# (8) Complete Observations
factor_descriptive_statistics[1, 9] <- sum(complete.cases(dataset))
# (9) NA Count
factor_descriptive_statistics[1, 10] <- sum(is.na(dataset))
#(10) NA Precentage
factor_descriptive_statistics[1, 11] <- round(sum(is.na(dataset)) * 100 / length(dataset),
digits = 4)
if(!is.null(directory)) {
write.csv(x = factor_descriptive_statistics,
file = paste(directory, "/", file_name, sep = ""),
row.names = F)
}
# Return the dataframe of descriptive statistics
return(factor_descriptive_statistics)
} else if(is.numeric(dataset)){
# create the data frame to hold the numeric descriptive statistics
numeric_descriptive_statistics <- as.data.frame(matrix(nrow = 1,
ncol = 16))
colnames(numeric_descriptive_statistics) <- c("Xi", "Mean", "Median", "Var", "SD",
"min", "max", "range",
"1st Qrt", "3rd Qrt", "IQR",
"Skewness", "Kurtosis",
"Complete Obs.","NA cnt", "NA %")
# first write in the row name i.e. the aattribute name
# first write in the row name i.e. the aattribute name
numeric_descriptive_statistics[1, 1] <- deparse(substitute(dataset))
# (1) Mean
numeric_descriptive_statistics[1, 2] <- round(mean(x = dataset,
na.rm = TRUE),
digits = 4)
# (2) Median
numeric_descriptive_statistics[1, 3] <- round(median(x = dataset,
na.rm = TRUE),
digits = 4)
# (3) Variance
numeric_descriptive_statistics[1, 4] <- round(var(x = dataset,
na.rm = TRUE),
digits = 4)
# (4) Standard Deviation
numeric_descriptive_statistics[1, 5] <- round(sd(x = dataset,
na.rm = TRUE),
digits = 4)
# (5) Minimum
numeric_descriptive_statistics[1, 6] <- round(min(x = dataset,
na.rm = TRUE),
digits = 4)
# (6) Maximum
numeric_descriptive_statistics[1, 7] <- round(max(x = dataset,
na.rm = TRUE),
digits = 4)
# (7) Range
numeric_descriptive_statistics[1, 8] <- round(max(x = dataset,
na.rm = TRUE) -
min(dataset,
na.rm = TRUE),
digits = 4)
# (8) 1st Quartile
numeric_descriptive_statistics[1, 9] <- round(quantile(x = dataset,
probs = seq(from = 0,
to = 1,
by = 0.25),
na.rm = TRUE)[2],
digits = 4)
# (9) 3rd Quartile
numeric_descriptive_statistics[1, 10] <- round(quantile(x = dataset,
probs = seq(from = 0,
to = 1,
by = 0.25),
na.rm = TRUE)[4],
digits = 4)
# (10) Interquartile Range
numeric_descriptive_statistics[1, 11] <- round(quantile(x = dataset,
probs = seq(from = 0,
to = 1,
by = 0.25),
na.rm = TRUE)[4] -
quantile(x = dataset,
probs = seq(from = 0,
to = 1,
by = 0.25),
na.rm = TRUE)[2],
digits = 4)
# (11) Skewness
numeric_descriptive_statistics[1, 12] <- round(skewness(x = dataset,
na.rm = TRUE),
digits = 4)
# (12) Kurtosis
numeric_descriptive_statistics[1, 13] <- round(kurtosis(x = dataset,
na.rm = TRUE),
digits = 4)
# (13) Complete Observations
numeric_descriptive_statistics[1, 14] <- sum(complete.cases(dataset))
# (14) NA Count
numeric_descriptive_statistics[1, 15] <- round(sum(is.na(dataset)),
digits = 4)
# (15) NA Percentage
numeric_descriptive_statistics[1, 16] <- round(sum(is.na(dataset)) * 100 / length(dataset),
digits = 4)
if(!is.null(directory)) {
write.csv(x = numeric_descriptive_statistics,
file = paste(directory, "/", file_name, sep = ""),
row.names = F)
}
# Return the dataframe of descriptive statistics
return(numeric_descriptive_statistics)
}
}
# FUTURE NOTE: derive descriptive statistics for character variables
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.