R/data_quality_report.R

Defines functions data_quality_report data_quality num_summary

# Data Quality ------------------------------------------------------------

#' Presenter-ready excel sheet detailing key metrics of each column of a dataset.
#'
#' @param input A data.frame, a list of data.frames, a filename or a directory name.
#' @param pattern A character string to match filenames in the directory.
#' @return A Microsoft Excel report detailing important qualitative and
#' quantitative characteristics for each table entered. Each Excel sheet
#' will contain the type of each variable, a header, summary satatistis
#' for all numeric variables, and counts for each categorical variable.
#'
#'
data_quality_report <- function(input = getwd(), pattern = NULL,...){
  data <- detect_input(input, dir.pattern = pattern,...)

  cat('\nData Quality - GoQ')
  # cat(paste(readr::read_lines('GoQ.txt'), collapse = '\n'), '\n')

  cat('\nCreating data quality reports for each file...\n')

  # processing for a single data.frame
  if(is.data.frame(data)){
    excel_output <- list(data_quality(data)) %>% purrr::set_names(deparse(substitute(data)))
  }

  if(all(map_lgl(data, ~is.data.frame(.x)))){
    excel_output <- parallel::mclapply(FUN = data_quality, X = data, mc.cores = max(length(data),1))
  }

  cat('\nWriting results...\n')
  write_dq(excel_output, pattern = pattern)

  cat('Execution completed.')
  # return(excel_output)
}

#' creates a list of data quality tables for a data frame
#'
#' @param data The table to be analysed.
#'
#' @return A list containing data.tables for the type of each variable,
#' a header, summary satatistis for all numeric variables, and counts
#' for each categorical variable.
data_quality <- function(data){
  data.table::setDT(data)
  data_quality_list <- list()

  # top 20 rows
  data_quality_list$header <-  head(data,20)

  # identifies variable types
  data_quality_list$data_classes <- data.table(Variable = colnames(data), Class = map_chr(data, typeof))

  # splits into numerical and categorical subsets of the data
  numerical_data <- keep(data, is.numeric)
  categorical_data <- discard(data, is.numeric)

  # numerical variables
  if(length(numerical_data) > 0){
    numerical_data_summary <- map(.x = numerical_data,
                                  .f = num_summary) %>% data.table::rbindlist()

    colnames(numerical_data_summary) <-c("n","n_distinct","NA's",'valid data %',"sum", "mean","sd", "min",
                                         "1%", "5%", "25%","50%","75%", "95%", "99%", "99.9%", "99.99%", "max")
    data_quality_list$numerical_data_summary <- cbind(data.table(variable = colnames(numerical_data)), numerical_data_summary)
  }

  # DQ categorical  data
  if(length(categorical_data) > 0){
    categorical_data_summary <- map_df(categorical_data, cat_summary)
    data_quality_list$categorical_data_summary <- cbind(data.table(variable = colnames(categorical_data)))

    data_quality_list$categorical_headers <- mclapply(categorical_data, categorical_var_summary, mc.cores = 12)
  }
  # final output
  # dq_objects <- c('data_classes',
  #                 'header',
  #                 'numerical_data_summary',
  #                 'categorical_data_summary',
  #                 'categorical_headers')
  # data_quality_list <- dq_objects %>%
  #   purrr::keep(exists, envir=sys.frame(which = 0)) %>%
  #   purrr::map(get) %>%
  #   purrr::set_names(dq_objects %>% purrr::keep(exists, envir=sys.frame(which = 0)))

  return(data_quality_list)
}

#' Data quality summary for numerical data
#'
#' @param x Numeric vector
#' @return A data.table with the following metrics for the vector:
#' count of valid datapoints, count of distinct values, count of NAs,
#' percentage of valid data, sum, mean, stdev, min, quantiles for key cutoff points,
#' and maximum value.
num_summary <- function(x){
  x <- data.table(value = x)
  num_sum <- x[, .(sum(!is.na(value)),
                   n_distinct = uniqueN(value),
                   NAs = sum(is.na(value)),
                   valid_data_percentage = sum(!is.na(value))/length(value),
                   Sum = sum(value, na.rm = TRUE),
                   Mean = mean(value, na.rm =
                                 TRUE),
                   Standdev = sd(value, na.rm = TRUE),
                   Minimum = min(value, na.rm = TRUE),
                   P1 = round(quantile(
                     value, na.rm = TRUE, probs = (0.01)
                   ), digits = 2),
                   P5 = round(quantile(
                     value, na.rm = TRUE, probs = (0.05)
                   ), digits = 2),
                   P25 = round(quantile(
                     value, na.rm = TRUE, probs = (0.25)
                   ), digits = 2),
                   P50 = round(quantile(
                     value, na.rm = TRUE, probs = (0.50)
                   ), digits = 2),
                   P75 = round(quantile(
                     value, na.rm = TRUE, probs = (0.75)
                   ), digits = 2),
                   P95 = round(quantile(
                     value, na.rm = TRUE, probs = (0.95)
                   ), digits = 2),
                   P99 = round(quantile(
                     value, na.rm = TRUE, probs = (0.99)
                   ), digits = 2),
                   P99.9 = round(quantile(
                     value, na.rm = TRUE, probs = (0.999)
                   ), digits = 2),
                   P99.99 = round(quantile(
                     value, na.rm = TRUE, probs = (0.9999)
                   ), digits = 2),
                   Max = max(value, na.rm = TRUE))]
  return(num_sum)
}

#' Data quality summary for a vector of categorical data
#'
#' @param x Character vector
#' @return A data.table with the count of observations, distinct values
#' and NAs for the character vector
cat_summary <- function(x){
  x <- data.table::data.table(value = x)
  cat_sum <- x[, .(n = sum(!is.na(value)),
                   n_distinct = data.table::uniqueN(value),
                   NAs = sum(is.na(value)))]
  return(cat_sum)
}

#' Data quality summary for a vector of categorical data
#'
#' @param x Character vector
#' @return A data.table with the counts and frecuency of the top 20
#' values of the vector, along with the count and frecuency of an
#' 'Others' and  'Total' aggregated categories
categorical_var_summary <- function(x){
  dt <- data.table::data.table(value = x, key = 'value')
  cat_var_sum <- dt[, .N, value # count
                    ][,perc := N/sum(N)
                      ][order(-N)
                        ][dt[, .N, value][order(-N)][, .I > 20], ':='(N = sum(N), perc = sum(perc), value = 'Others') # data.table magic ??? See Details
                          ][1:21] %>% na.omit()
  cat_var_sum <- cat_var_sum %>% rbind(cat_var_sum[, .(value = 'Total', N = sum(N), perc = 1)], use.names = TRUE, fill = TRUE)
}

#' Creates an excel workboook containing the dq reports for each table
#' @param excel_output A list of outputs from data_quality()
#' @param tablestyle A character detailing the Excel table style wanted
#' @param pattern Only included here to pass onto filename
write_dq <- function(excel_output, tablestyle = 'TableStyleMedium2', pattern = NULL){
  wb <- createWorkbook()
  # mcmapply(FUN = write_dq_sheet, excel_output, names(excel_output), MoreArgs = list(wb = wb, tablestyle = 'TableStyleMedium4'))
  walk2(excel_output, names(excel_output), write_dq_sheet, wb = wb, tablestyle = tablestyle)
  wb_title <- paste0('data_quality_', pattern,
                     substring(gsub('[: ]', '-', as.character(Sys.time())),1,16),
                     '.xlsx')
  saveWorkbook(wb, wb_title, overwrite = TRUE)
}

#' Writes the list resulting from data_quality() into a openxlsx::wb object
#' @param sheet_data output from data_quality to be printed onto the sheet
#' @param sheetname  Name of the sheet in the workbook
#' @param wb wb object to add the sheet into
#' @param tablestyle A character detailing the Excel table style wanted
write_dq_sheet <-  function(sheet_data, sheetname, wb, tablestyle = 'TableStyleMedium4'){
  row_num <- 1
  modifyBaseFont(wb, fontName = 'Mark Offc For MC')
  addWorksheet(wb, sheetname, gridLines = TRUE)
  showGridLines(wb, sheetname, showGridLines = FALSE)
  setColWidths(wb, sheetname, cols = 1:1000, widths = 'auto')

  header1 <- createStyle(fontName = 'Mark Offc For MC',
                         fontSize = 15,
                         textDecoration = 'bold',
                         border = c('top', 'bottom'))

  header2 <- createStyle(fontName = 'Mark Offc For MC',
                         textDecoration = 'bold',
                         fontColour = 'white',
                         border = c('top', 'bottom')
  )

  # data_classes ----
  writeData(wb, sheetname, x = 'Variable Classes', headerStyle = header1)
  writeDataTable(wb,
                 sheetname,
                 x  = sheet_data$data_classes %>% as.data.frame(),
                 startRow = 2,
                 colNames = T,
                 keepNA = TRUE,
                 tableStyle = tablestyle,
                 headerStyle = header2, )

  row_num <- row_num + nrow(sheet_data$data_classes) + 3

  # first 20 rows ----
  writeData(wb, sheetname, x = 'First 20 rows', startRow = row_num, headerStyle = header1)
  writeDataTable(wb,
                 sheetname,
                 x = sheet_data$header,
                 startRow = row_num + 1,
                 keepNA = TRUE,
                 tableStyle = tablestyle,
                 headerStyle = header2)
  row_num <- row_num + nrow(sheet_data$head) + 3

  # numerical variables summary ----
  if('data.frame' %in% class(sheet_data$numerical_data_summary)){
    writeData(wb, sheetname, x = 'Numerical Variables', startRow = row_num, headerStyle = header1)
    writeDataTable(wb,
                   sheetname,
                   x = sheet_data$numerical_data_summary,
                   startRow = row_num + 1,
                   keepNA = TRUE,
                   tableStyle = tablestyle,
                   headerStyle = header2)
    row_num <- row_num + nrow(sheet_data$numerical_data_summary) + 3
  }

  # categorical vars summary ----
  if('data.frame' %in% class(sheet_data$categorical_data_summary)){
    writeData(wb, sheetname, x = 'Categorical Variables', startRow = row_num, headerStyle = header1)
    writeDataTable(wb,
                   sheetname,
                   x = as.data.frame(sheet_data$categorical_data_summary),
                   startRow = row_num + 1,
                   keepNA = TRUE,
                   tableStyle = tablestyle,
                   headerStyle = header2)
    row_num <- row_num + nrow(sheet_data$categorical_data_summary) + 3
  }


  # individual categorical variables ----
  if('data.frame' %in% class(sheet_data$categorical_data_summary)){
    for (j in seq_along(sheet_data$categorical_headers)) {
      writeData(wb, sheetname, x = names(sheet_data$categorical_headers)[j], startRow = row_num, headerStyle = header1)
      writeDataTable(wb,
                     sheetname,
                     x = sheet_data$categorical_headers[[j]],
                     startRow = row_num + 1,
                     keepNA = TRUE,
                     tableStyle = tablestyle,
                     headerStyle = header2)
      row_num <- row_num + nrow(sheet_data$categorical_headers[[j]]) + 3
    }
  }

}
pheymanss/dq documentation built on March 12, 2020, 1:29 a.m.