#'
#' This function analyzes the data quality of a table. It gives the dimensions of the
#' table along with the number of unique values and missing values/ It also outputs a
#' table with the number of unique values, the type and the percentage of missing values
#' by variable.
#'
#' @param data the table to analyzed
#' @param numeric_cutoff an integer specifying the minimal number of unique values
#' necessary for a vector not to be considered a factor. Default to -1
#' @param na_type a character vector of strings that will be interpreted as NA
#'
#' @param return a list with two elements, one with global information (dimensions, unique
#' values and missing values) and a table with information for each variable.
#'
#' @export
#' @import data.table
#'
quality <- function (data, numeric_cutoff = -1, na_type = c("", " ")){
# Arguments check
if(!is.data.frame(data) & !is.data.table(data))
stop("'data' must either be a data.frame or a data.table.")
if(!is.data.table(data)) data <- as.data.table(data)
if(!(is.numeric(numeric_cutoff) & length(numeric_cutoff) == 1))
stop("'numeric_cutoff' must be numeric of length one.")
if(!is.null(na_type)) if(!is.character(na_type))
stop("'na_type' must be a character vector.")
# set vaues in na_type to NA
if(!is.null(na_type)){
for(j in seq_along(data)){
set(data, i = which(data[[j]] %in% na_type), j = j, value = NA)
}
}
# is_** function to define type
is_categorical <- function(x) is.factor(x) || is.character(x) || uniqueN(x) <= numeric_cutoff
is_numeric <- function(x) is.numeric(x) & uniqueN(x) > numeric_cutoff
is_date <- function(x) inherits(x, 'Date') | inherits(x, 'POSIXct') | inherits(x, 'POSIXlt')
# global info
n_cols <- ncol(data)
n_rows <- nrow(data)
n_unique <- nrow(unique(data))
n_missing <- sum(is.na(data))
# columns types
categorical_var <- which(sapply(X = colnames(data),
FUN = function(name) is_categorical(data[[name]])) == TRUE)
numeric_var <- which(sapply(X = colnames(data),
FUN = function(name) is_numeric(data[[name]])) == TRUE)
date_var <- which(sapply(X = colnames(data),
FUN = function(name) is_date(data[[name]])) == TRUE)
logical_var <- which(sapply(X = colnames(data),
FUN = function(name) is.logical(data[[name]])) == TRUE)
# data quality output
types <- rep(x = "undefined", length = n_cols)
types[categorical_var] <- "character"
types[numeric_var] <- "numeric"
types[date_var] <- "date"
types[logical_var] <- 'logical'
n_miss <- colSums(is.na(data))
percent_miss <- 100 * n_miss / n_rows
n_unique_values <- sapply(data, uniqueN)
output_global <- data.table(
names(n_miss),
types,
n_miss,
as.numeric(format(percent_miss, digits = 0)),
n_unique_values
)
colnames(output_global) <- c("Variable", "Type", "Missing values",
"Percentage of missing values", "Unique values")
return(list(global = list(n_cols = n_cols, n_rows = n_rows, n_unique = n_unique,
n_missing = n_missing),
table = output_global))
}
#'
#' This function plots the missing values in a dataset. If a value is missing in a column
#' for a given row then a bar is displayed.
#'
#' @param data a data.frame
#' @param order logical, whether to order the columns and rows to display the missing
#' values next to each other, defautl to FALSE.
#' @param na_type a character vector of strings that will be interpreted as NA
#'
#' @details If the order argument is set to TRUE then the order of the rows is modified
#' and you cannot use the plot's x-axis to find a missing value in the dataset.
#'
#' @export
#' @import data.table
#' @import ggplot2
#'
plotMissing <- function(data, na_type = c('', ' '), order = FALSE) {
# check argument
if(!is.data.frame(data)) {
stop("'data' must be a data.frame.")
}
if(!is.data.table(data)) {
setDT(data)
}
if(!is.null(na_type)) if(!is.character(na_type))
stop("'na_type' must be a character vector.")
# set vaues in na_type to NA
if(!is.null(na_type)){
for(j in seq_along(data)){
set(data, i = which(data[[j]] %in% na_type), j = j, value = NA)
}
}
# create matrix with missing data indicator
missing_data <- as.matrix(is.na(data))
# add number of missing values to variable name
order_col <- apply(X = missing_data, MARGIN = 2, FUN = sum)
colnames(missing_data) <- paste0(colnames(missing_data), ' (', order_col, ')')
if(order) {
# reorder the columns to group the missing values
missing_data <- missing_data[, order(order_col)]
# reorder the rows to group the missing values
order_row <- apply(X = missing_data, MARGIN = 1, FUN = sum)
missing_data <- missing_data[order(-order_row), ]
}
# coerce to data table and add row indices for the x value of the plot
missing_data <- as.data.table(missing_data)
missing_data$row <- 1:nrow(missing_data)
# melt to long format to be accepted by ggplot
missing_data <- melt(
data = missing_data,
id.vars = 'row',
variable.name = 'variable',
value.name = 'missing'
)
# plot
ggplot(data = missing_data, mapping = aes(x = row, y = variable, fill = missing)) +
geom_tile(show.legend = FALSE) +
scale_fill_manual(values = c('TRUE' = 'maroon', 'FALSE' = 'transparent')) +
labs(x = 'Row number', y = 'Variable',
title = paste0('Missing values in ', deparse(substitute(data)))) +
theme_bw()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.