# Data Quality ------------------------------------------------------------
#' Presenter-ready excel sheet detailing key metrics of each column of a dataset.
#'
#' @param input A data.frame, a list of data.frames, a filename or a directory name.
#' @param pattern A character string to match filenames in the directory.
#' @return A Microsoft Excel report detailing important qualitative and
#' quantitative characteristics for each table entered. Each Excel sheet
#' will contain the type of each variable, a header, summary satatistis
#' for all numeric variables, and counts for each categorical variable.
#'
#'
data_quality_report <- function(input = getwd(), pattern = NULL,...){
data <- detect_input(input, dir.pattern = pattern,...)
cat('\nData Quality - GoQ')
# cat(paste(readr::read_lines('GoQ.txt'), collapse = '\n'), '\n')
cat('\nCreating data quality reports for each file...\n')
# processing for a single data.frame
if(is.data.frame(data)){
excel_output <- list(data_quality(data)) %>% purrr::set_names(deparse(substitute(data)))
}
if(all(map_lgl(data, ~is.data.frame(.x)))){
excel_output <- parallel::mclapply(FUN = data_quality, X = data, mc.cores = max(length(data),1))
}
cat('\nWriting results...\n')
write_dq(excel_output, pattern = pattern)
cat('Execution completed.')
# return(excel_output)
}
#' creates a list of data quality tables for a data frame
#'
#' @param data The table to be analysed.
#'
#' @return A list containing data.tables for the type of each variable,
#' a header, summary satatistis for all numeric variables, and counts
#' for each categorical variable.
data_quality <- function(data){
data.table::setDT(data)
data_quality_list <- list()
# top 20 rows
data_quality_list$header <- head(data,20)
# identifies variable types
data_quality_list$data_classes <- data.table(Variable = colnames(data), Class = map_chr(data, typeof))
# splits into numerical and categorical subsets of the data
numerical_data <- keep(data, is.numeric)
categorical_data <- discard(data, is.numeric)
# numerical variables
if(length(numerical_data) > 0){
numerical_data_summary <- map(.x = numerical_data,
.f = num_summary) %>% data.table::rbindlist()
colnames(numerical_data_summary) <-c("n","n_distinct","NA's",'valid data %',"sum", "mean","sd", "min",
"1%", "5%", "25%","50%","75%", "95%", "99%", "99.9%", "99.99%", "max")
data_quality_list$numerical_data_summary <- cbind(data.table(variable = colnames(numerical_data)), numerical_data_summary)
}
# DQ categorical data
if(length(categorical_data) > 0){
categorical_data_summary <- map_df(categorical_data, cat_summary)
data_quality_list$categorical_data_summary <- cbind(data.table(variable = colnames(categorical_data)))
data_quality_list$categorical_headers <- mclapply(categorical_data, categorical_var_summary, mc.cores = 12)
}
# final output
# dq_objects <- c('data_classes',
# 'header',
# 'numerical_data_summary',
# 'categorical_data_summary',
# 'categorical_headers')
# data_quality_list <- dq_objects %>%
# purrr::keep(exists, envir=sys.frame(which = 0)) %>%
# purrr::map(get) %>%
# purrr::set_names(dq_objects %>% purrr::keep(exists, envir=sys.frame(which = 0)))
return(data_quality_list)
}
#' Data quality summary for numerical data
#'
#' @param x Numeric vector
#' @return A data.table with the following metrics for the vector:
#' count of valid datapoints, count of distinct values, count of NAs,
#' percentage of valid data, sum, mean, stdev, min, quantiles for key cutoff points,
#' and maximum value.
num_summary <- function(x){
x <- data.table(value = x)
num_sum <- x[, .(sum(!is.na(value)),
n_distinct = uniqueN(value),
NAs = sum(is.na(value)),
valid_data_percentage = sum(!is.na(value))/length(value),
Sum = sum(value, na.rm = TRUE),
Mean = mean(value, na.rm =
TRUE),
Standdev = sd(value, na.rm = TRUE),
Minimum = min(value, na.rm = TRUE),
P1 = round(quantile(
value, na.rm = TRUE, probs = (0.01)
), digits = 2),
P5 = round(quantile(
value, na.rm = TRUE, probs = (0.05)
), digits = 2),
P25 = round(quantile(
value, na.rm = TRUE, probs = (0.25)
), digits = 2),
P50 = round(quantile(
value, na.rm = TRUE, probs = (0.50)
), digits = 2),
P75 = round(quantile(
value, na.rm = TRUE, probs = (0.75)
), digits = 2),
P95 = round(quantile(
value, na.rm = TRUE, probs = (0.95)
), digits = 2),
P99 = round(quantile(
value, na.rm = TRUE, probs = (0.99)
), digits = 2),
P99.9 = round(quantile(
value, na.rm = TRUE, probs = (0.999)
), digits = 2),
P99.99 = round(quantile(
value, na.rm = TRUE, probs = (0.9999)
), digits = 2),
Max = max(value, na.rm = TRUE))]
return(num_sum)
}
#' Data quality summary for a vector of categorical data
#'
#' @param x Character vector
#' @return A data.table with the count of observations, distinct values
#' and NAs for the character vector
cat_summary <- function(x){
x <- data.table::data.table(value = x)
cat_sum <- x[, .(n = sum(!is.na(value)),
n_distinct = data.table::uniqueN(value),
NAs = sum(is.na(value)))]
return(cat_sum)
}
#' Data quality summary for a vector of categorical data
#'
#' @param x Character vector
#' @return A data.table with the counts and frecuency of the top 20
#' values of the vector, along with the count and frecuency of an
#' 'Others' and 'Total' aggregated categories
categorical_var_summary <- function(x){
dt <- data.table::data.table(value = x, key = 'value')
cat_var_sum <- dt[, .N, value # count
][,perc := N/sum(N)
][order(-N)
][dt[, .N, value][order(-N)][, .I > 20], ':='(N = sum(N), perc = sum(perc), value = 'Others') # data.table magic ??? See Details
][1:21] %>% na.omit()
cat_var_sum <- cat_var_sum %>% rbind(cat_var_sum[, .(value = 'Total', N = sum(N), perc = 1)], use.names = TRUE, fill = TRUE)
}
#' Creates an excel workboook containing the dq reports for each table
#' @param excel_output A list of outputs from data_quality()
#' @param tablestyle A character detailing the Excel table style wanted
#' @param pattern Only included here to pass onto filename
write_dq <- function(excel_output, tablestyle = 'TableStyleMedium2', pattern = NULL){
wb <- createWorkbook()
# mcmapply(FUN = write_dq_sheet, excel_output, names(excel_output), MoreArgs = list(wb = wb, tablestyle = 'TableStyleMedium4'))
walk2(excel_output, names(excel_output), write_dq_sheet, wb = wb, tablestyle = tablestyle)
wb_title <- paste0('data_quality_', pattern,
substring(gsub('[: ]', '-', as.character(Sys.time())),1,16),
'.xlsx')
saveWorkbook(wb, wb_title, overwrite = TRUE)
}
#' Writes the list resulting from data_quality() into a openxlsx::wb object
#' @param sheet_data output from data_quality to be printed onto the sheet
#' @param sheetname Name of the sheet in the workbook
#' @param wb wb object to add the sheet into
#' @param tablestyle A character detailing the Excel table style wanted
write_dq_sheet <- function(sheet_data, sheetname, wb, tablestyle = 'TableStyleMedium4'){
row_num <- 1
modifyBaseFont(wb, fontName = 'Mark Offc For MC')
addWorksheet(wb, sheetname, gridLines = TRUE)
showGridLines(wb, sheetname, showGridLines = FALSE)
setColWidths(wb, sheetname, cols = 1:1000, widths = 'auto')
header1 <- createStyle(fontName = 'Mark Offc For MC',
fontSize = 15,
textDecoration = 'bold',
border = c('top', 'bottom'))
header2 <- createStyle(fontName = 'Mark Offc For MC',
textDecoration = 'bold',
fontColour = 'white',
border = c('top', 'bottom')
)
# data_classes ----
writeData(wb, sheetname, x = 'Variable Classes', headerStyle = header1)
writeDataTable(wb,
sheetname,
x = sheet_data$data_classes %>% as.data.frame(),
startRow = 2,
colNames = T,
keepNA = TRUE,
tableStyle = tablestyle,
headerStyle = header2, )
row_num <- row_num + nrow(sheet_data$data_classes) + 3
# first 20 rows ----
writeData(wb, sheetname, x = 'First 20 rows', startRow = row_num, headerStyle = header1)
writeDataTable(wb,
sheetname,
x = sheet_data$header,
startRow = row_num + 1,
keepNA = TRUE,
tableStyle = tablestyle,
headerStyle = header2)
row_num <- row_num + nrow(sheet_data$head) + 3
# numerical variables summary ----
if('data.frame' %in% class(sheet_data$numerical_data_summary)){
writeData(wb, sheetname, x = 'Numerical Variables', startRow = row_num, headerStyle = header1)
writeDataTable(wb,
sheetname,
x = sheet_data$numerical_data_summary,
startRow = row_num + 1,
keepNA = TRUE,
tableStyle = tablestyle,
headerStyle = header2)
row_num <- row_num + nrow(sheet_data$numerical_data_summary) + 3
}
# categorical vars summary ----
if('data.frame' %in% class(sheet_data$categorical_data_summary)){
writeData(wb, sheetname, x = 'Categorical Variables', startRow = row_num, headerStyle = header1)
writeDataTable(wb,
sheetname,
x = as.data.frame(sheet_data$categorical_data_summary),
startRow = row_num + 1,
keepNA = TRUE,
tableStyle = tablestyle,
headerStyle = header2)
row_num <- row_num + nrow(sheet_data$categorical_data_summary) + 3
}
# individual categorical variables ----
if('data.frame' %in% class(sheet_data$categorical_data_summary)){
for (j in seq_along(sheet_data$categorical_headers)) {
writeData(wb, sheetname, x = names(sheet_data$categorical_headers)[j], startRow = row_num, headerStyle = header1)
writeDataTable(wb,
sheetname,
x = sheet_data$categorical_headers[[j]],
startRow = row_num + 1,
keepNA = TRUE,
tableStyle = tablestyle,
headerStyle = header2)
row_num <- row_num + nrow(sheet_data$categorical_headers[[j]]) + 3
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.