R/shadows.R

Defines functions gather_shadow bind_shadow as_shadow.data.frame as_shadow

Documented in as_shadow as_shadow.data.frame bind_shadow gather_shadow

#' Create shadows
#'
#' Representing missing data structure is achieved using the shadow matrix,
#' introduced in [Swayne and Buja](https://www.researchgate.net/publication/2758672_Missing_Data_in_Interactive_High-Dimensional_Data_Visualization). The shadow
#' matrix is the same dimension as the data, and consists of binary indicators
#' of missingness of data values, where missing is represented as "NA", and not
#' missing is represented as "!NA". Although these may be represented as 1 and
#' 0, respectively.
#'
#' @param data dataframe
#' @param ... selected variables to use
#'
#' @return appended shadow with column names
#' @export

as_shadow <- function(data, ...){

  test_if_null(data)

  test_if_dataframe(data)

  UseMethod("as_shadow")
}

#' Create shadow data
#'
#' Return a tibble in shadow matrix form, where the variables are the same but
#' have a suffix _NA attached to distinguish them.
#'
#' @inheritParams as_shadow
#'
#' @examples
#'
#' as_shadow(airquality)
#'
#' @export
as_shadow.data.frame <- function(data, ...){

    data_shadow <- purrr::map_df(data, label_shadow_matrix)

    names(data_shadow) <- paste0(names(data),"_NA")

    data_shadow

}

#' Bind a shadow dataframe to original data
#'
#' Binding a shadow matrix to a regular dataframe helps visualise and work with
#' missing data.
#'
#' @param data a dataframe
#' @param only_miss logical - if FALSE (default) it will bind a dataframe with
#'     all of the variables duplicated with their shadow. Setting this to TRUE
#'     will bind variables only those variables that contain missing values.
#'     See the examples for more details.
#'
#' @return data with the added variable shifted and the suffix `_NA`
#' @export
#'
#' @examples
#'
#' bind_shadow(airquality)
#'
#' # bind only the variables that contain missing values
#' bind_shadow(airquality, only_miss = TRUE)
#'
#' aq_shadow <- bind_shadow(airquality)
#'
#' # explore missing data visually
#' library(ggplot2)
#'
#' # using the bounded shadow to visualise Ozone according to whether Solar
#' # Radiation is missing or not.
#'
#' ggplot(data = aq_shadow,
#'        aes(x = Ozone)) +
#'        geom_histogram() +
#'        facet_wrap(~Solar.R_NA,
#'        ncol = 1)
#'
bind_shadow <- function(data, only_miss = FALSE){

  # If you want only the missing values to be added
  if (only_miss) {

    # I want to only select columns that contain a missing value.
    miss_vars <- rlang::syms(which_var_na(data))

    shadow_vars <- dplyr::select(data, !!!miss_vars) %>% as_shadow()

    tibble::as_tibble(dplyr::bind_cols(data, shadow_vars))

  # if you want All the values to be added (the default behaviour)
  } else if (!only_miss) {

    data_shadow <- as_shadow(data)

    bound_shadow <- dplyr::bind_cols(data, data_shadow)

    tibble::as_tibble(bound_shadow)

  }

}

#' Long form representation of a shadow matrix
#'
#' `gather_shadow` is a long-form representation of binding the shadow matrix to
#'     your data, producing variables named `case`, `variable`, and `missing`, where
#'     `missing` contains the missing value representation.
#'
#' @param data a dataframe
#'
#' @return dataframe in long, format, containing information about the missings
#'
#' @export
#'
#' @examples
#'
#' gather_shadow(airquality)
#'
gather_shadow <- function(data){

  as_shadow(data) %>%
    dplyr::mutate(rows = 1:nrow(.)) %>%
    tidyr::gather(key = "variable",
                  value = "missing",
                  -rows) %>%
    dplyr::rename(case = rows)
}

Try the naniar package in your browser

Any scripts or data that you put into this service are public.

naniar documentation built on Feb. 9, 2018, 6:05 a.m.