R/quality.R

Defines functions plotMissing

Documented in plotMissing

#'
#' This function analyzes the data quality of a table. It gives the dimensions of the
#' table along with the number of unique values and missing values/ It also outputs a
#' table with the number of unique values, the type and the percentage of missing values
#' by variable.
#'
#' @param data the table to analyzed
#' @param numeric_cutoff an integer specifying the minimal number of unique values
#'   necessary for a vector not to be considered a factor. Default to -1
#' @param na_type a character vector of strings that will be interpreted as NA
#'
#' @param return a list with two elements, one with global information (dimensions, unique
#'   values and missing values) and a table with information for each variable.
#'   
#' @export
#' @import data.table
#'   
quality <- function (data, numeric_cutoff = -1, na_type = c("", " ")){
  # Arguments check
  if(!is.data.frame(data) & !is.data.table(data)) 
    stop("'data' must either be a data.frame or a data.table.")
  if(!is.data.table(data)) data <- as.data.table(data)
  if(!(is.numeric(numeric_cutoff) & length(numeric_cutoff) == 1)) 
    stop("'numeric_cutoff' must be numeric of length one.")
  if(!is.null(na_type)) if(!is.character(na_type)) 
    stop("'na_type' must be a character vector.")
  
  # set vaues in na_type to NA
  if(!is.null(na_type)){
    for(j in seq_along(data)){
      set(data, i = which(data[[j]] %in% na_type), j = j, value = NA)
    }
  }
  
  # is_** function to define type
  is_categorical <- function(x) is.factor(x) || is.character(x) || uniqueN(x) <= numeric_cutoff
  is_numeric <- function(x) is.numeric(x) & uniqueN(x) > numeric_cutoff
  is_date <- function(x) inherits(x, 'Date') | inherits(x, 'POSIXct') | inherits(x, 'POSIXlt')
  
  # global info
  n_cols <- ncol(data)
  n_rows <- nrow(data)
  n_unique <- nrow(unique(data))
  n_missing <- sum(is.na(data))
  
  # columns types
  categorical_var <- which(sapply(X = colnames(data), 
                                  FUN = function(name) is_categorical(data[[name]])) == TRUE)
  numeric_var <- which(sapply(X = colnames(data),
                              FUN = function(name) is_numeric(data[[name]])) == TRUE)
  date_var <- which(sapply(X = colnames(data),
                           FUN = function(name) is_date(data[[name]])) == TRUE)
  logical_var <- which(sapply(X = colnames(data),
                              FUN = function(name) is.logical(data[[name]])) == TRUE)
  
  # data quality output
  types <- rep(x = "undefined", length = n_cols)
  types[categorical_var] <- "character"
  types[numeric_var] <- "numeric"
  types[date_var] <- "date"
  types[logical_var] <- 'logical'
  n_miss <- colSums(is.na(data))
  percent_miss <- 100 * n_miss / n_rows
  n_unique_values <- sapply(data, uniqueN)
  output_global <- data.table(
    names(n_miss), 
    types, 
    n_miss, 
    as.numeric(format(percent_miss, digits = 0)), 
    n_unique_values
  )
  colnames(output_global) <- c("Variable", "Type", "Missing values", 
                               "Percentage of missing values", "Unique values")
  return(list(global = list(n_cols = n_cols, n_rows = n_rows, n_unique = n_unique, 
                            n_missing = n_missing),
              table = output_global))
}

#'
#' This function plots the missing values in a dataset. If a value is missing in a column
#' for a given row then a bar is displayed.
#'
#' @param data a data.frame
#' @param order logical, whether to order the columns and rows to display the missing
#'   values next to each other, defautl to FALSE.
#' @param na_type a character vector of strings that will be interpreted as NA
#'
#' @details If the order argument is set to TRUE then the order of the rows is modified
#'   and you cannot use the plot's x-axis to find a missing value in the dataset.
#'   
#' @export 
#' @import data.table
#' @import ggplot2
#' 
plotMissing <- function(data, na_type = c('', ' '), order = FALSE) {
  # check argument
  if(!is.data.frame(data)) {
    stop("'data' must be a data.frame.")
  }
  if(!is.data.table(data)) {
    setDT(data)
  }
  if(!is.null(na_type)) if(!is.character(na_type)) 
    stop("'na_type' must be a character vector.")
  
  # set vaues in na_type to NA
  if(!is.null(na_type)){
    for(j in seq_along(data)){
      set(data, i = which(data[[j]] %in% na_type), j = j, value = NA)
    }
  }
  
  # create matrix with missing data indicator
  missing_data <- as.matrix(is.na(data))
  
  # add number of missing values to variable name
  order_col <- apply(X = missing_data, MARGIN = 2, FUN = sum)
  colnames(missing_data) <- paste0(colnames(missing_data), ' (', order_col, ')')
  
  if(order) {
    # reorder the columns to group the missing values
    missing_data <- missing_data[, order(order_col)]
    
    # reorder the rows to group the missing values
    order_row <- apply(X = missing_data, MARGIN = 1, FUN = sum)
    missing_data <- missing_data[order(-order_row), ]
  }
  
  # coerce to data table and add row indices for the x value of the plot
  missing_data <- as.data.table(missing_data)
  missing_data$row <- 1:nrow(missing_data)
  
  # melt to long format to be accepted by ggplot
  missing_data <- melt(
    data = missing_data,
    id.vars = 'row',
    variable.name = 'variable',
    value.name = 'missing'
  )
  
  # plot
  ggplot(data = missing_data, mapping = aes(x = row, y = variable, fill = missing)) + 
    geom_tile(show.legend = FALSE)  +
    scale_fill_manual(values = c('TRUE' = 'maroon', 'FALSE' = 'transparent')) + 
    labs(x = 'Row number', y = 'Variable', 
         title = paste0('Missing values in ', deparse(substitute(data)))) + 
    theme_bw()
}
MathieuMarauri/explorer documentation built on Jan. 8, 2020, 6:37 p.m.