R/add-cols.R
In naniar: Data Structures, Summaries, and Visualisations for Missing Data

Documented in add_any_miss add_label_missings add_label_shadow add_miss_cluster add_shadow add_shadow_shift label_missings

#' Add a shadow column to dataframe
#'
#' As an alternative to `bind_shadow()`, you can add specific individual shadow
#'   columns to a dataset. These also respect the dplyr verbs
#'   `starts_with`, `contains`, `ends_with`, etc.
#'
#' @param data data.frame
#' @param ... One or more unquoted variable names, separated by commas. These also
#'   respect the dplyr verbs `starts_with`, `contains`, `ends_with`, etc.
#'
#' @return data.frame
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>% add_shadow(Ozone)
#' airquality %>% add_shadow(Ozone, Solar.R)
#'
add_shadow <- function(data, ...){

  # At least one variable must be selected. The helper is handed whether `...`
  # is empty together with the message to use (presumably it aborts when the
  # dots are empty -- its body is not visible here).
  test_if_dots_missing(
    dots_empty = missing(...),
    msg = "{.fun add_shadow} requires variables to be selected"
  )

  # Build the shadow columns from the selected variables only.
  selected_vars <- dplyr::select(data, ...)
  shadow_cols <- tibble::as_tibble(as_shadow(selected_vars))

  # Append the shadow columns to the original data, returned as a tibble.
  dplyr::bind_cols(tibble::as_tibble(data), shadow_cols)
}

#' Add a shadow shifted column to a dataset
#'
#' Shadow shift missing values using only the selected variables in a dataset,
#'   by specifying variable names or use dplyr `vars` and dplyr verbs
#'   `starts_with`, `contains`, `ends_with`, etc.
#'
#' @param data data.frame
#' @param ... One or more unquoted variable names separated by commas. These also
#'   respect the dplyr verbs `starts_with`, `contains`, `ends_with`, etc.
#' @param suffix suffix to add to variable, defaults to "shift"
#'
#' @return data with the added variable shifted named as `var_suffix`
#'
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>% add_shadow_shift(Ozone, Solar.R)
#'
add_shadow_shift <- function(data, ..., suffix = "shift"){

  # With no variables selected every column of `data` is shifted; otherwise
  # only the selected columns are.
  vars_to_shift <- if (missing(...)) data else dplyr::select(data, ...)

  # Apply `impute_below()` to each column and column-bind the results.
  # TODO: it would be good to have a way of indicating that no shift was
  # taken at all.
  shifted <- purrr::map_dfc(vars_to_shift, impute_below)

  # Name each shifted column `<variable>_<suffix>`.
  names(shifted) <- paste0(names(shifted), "_", suffix)

  # Original data first, shifted columns appended, everything as a tibble.
  dplyr::bind_cols(
    tibble::as_tibble(data),
    tibble::as_tibble(shifted)
  )
}

#' Add a column describing presence of any missing values
#'
#' This adds a column that describes, for each row, whether there are any
#'   missings in all of the variables (default), or in any of the specified
#'   columns, which can be given as variable names or with dplyr verbs such as
#'   `starts_with`, `contains`, `ends_with`, etc. By default the added column
#'   will be called "any_miss_all" if no variables are specified; otherwise,
#'   if variables are specified, the column will be called "any_miss_vars" to
#'   indicate that not all variables have been used to create the labels.
#'
#'
#'   By default the
#'   prefix "any_miss" is used, but this can be changed in the `label` argument.
#'
#' @param data data.frame
#' @param ...
#'
#' Variable names to use instead of the whole dataset. By default this
#'   looks at the whole dataset. Otherwise, this is one or more unquoted
#'   expressions separated by commas. These also respect the dplyr verbs
#'   `starts_with`, `contains`, `ends_with`, etc. By default will add "_all" to
#'   the label if left blank, otherwise will add "_vars" to distinguish that it
#'   has not been used on all of the variables.
#' @param label label for the column, defaults to "any_miss". By default if no
#'   additional variables are listed the label col is "any_miss_all", otherwise
#'   it is "any_miss_vars", if variables are specified.
#' @param missing character a label for when values are missing - defaults to "missing"
#' @param complete character a label for when values are complete - defaults to "complete"
#'
#' @return data.frame with data and the column labelling whether that row (for
#'     those variables) has any missing values - indicated by "missing" and
#'     "complete".
#'
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>% add_any_miss()
#' airquality %>% add_any_miss(Ozone, Solar.R)
#'
add_any_miss <- function(data, ...,
                         label = "any_miss",
                         missing = "missing",
                         complete = "complete"){

  # Use every column when no variables are given, otherwise only the selected
  # ones. The suffix of the new column's name records which case applied.
  if (missing(...)) {
    checked_data <- data
    label_suffix <- "_all"
  } else {
    checked_data <- dplyr::select(data, ...)
    label_suffix <- "_vars"
  }

  # Compute the labels outside of `dplyr::mutate()`: inside a data mask,
  # `missing` and `complete` would resolve to columns of `data` that happen to
  # share those names instead of to the arguments of this function.
  row_has_missing <- any_row_miss(checked_data)

  # `== TRUE` is kept from the original because the return type of
  # `any_row_miss()` is not visible here (presumably logical -- TODO confirm).
  row_labels <- dplyr::if_else(condition = row_has_missing == TRUE,
                               true = missing,
                               false = complete)

  # One-column tibble holding the labels, named `<label>_all` / `<label>_vars`.
  label_column <- tibble::as_tibble(
    stats::setNames(list(row_labels), paste0(label, label_suffix))
  )

  tibble::as_tibble(dplyr::bind_cols(data, label_column))
}

#' Is there a missing value in the row of a dataframe?
#'
#' Creates a character vector describing presence/absence of missing values
#'
#' @param data a dataframe or set of vectors of the same length
#'
#' @return character vector of "Missing" and "Not Missing".
#' @param ... extra variable to label
#' @param missing character a label for when values are missing - defaults to "Missing"
#' @param complete character a label for when values are complete - defaults to "Not Missing"
#'
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' label_missings(airquality)
#'
#' \dontrun{
#' library(dplyr)
#'
#' airquality %>%
#'   mutate(is_missing = label_missings(airquality)) %>%
#'   head()
#'
#' airquality %>%
#'   mutate(is_missing = label_missings(airquality,
#'                                      missing = "definitely missing",
#'                                      complete = "absolutely complete")) %>%
#'   head()
#' }
label_missings <- function(data,
                           ...,
                           missing = "Missing",
                           complete = "Not Missing"){

  test_if_null(data)

  # Restrict the check to the selected variables, if any were given.
  if (!missing(...)) {
    data <- dplyr::select(data, ...)
  }

  # `data.frame()` lets a bare vector be handled as a one-column data frame.
  # `is.na()` on a data frame returns a logical matrix, so a row contains a
  # missing value exactly when its row sum is positive. This replaces a
  # row-by-row `apply()` that coerced the whole data frame to a matrix and
  # called `anyNA()` once per row.
  row_has_missing <- rowSums(is.na(data.frame(data))) > 0

  dplyr::if_else(condition = row_has_missing, # TRUE means missing
                 true = missing,
                 false = complete)

}

#' Add a column describing if there are any missings in the dataset
#'
#' @param data data.frame
#' @param ... extra variable to label
#' @param missing character a label for when values are missing - defaults to "Missing"
#' @param complete character a label for when values are complete - defaults to "Not Missing"
#'
#' @return data.frame with a column "any_missing" that is either "Not Missing"
#'   or "Missing" for the purposes of plotting / exploration / nice print methods
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>% add_label_missings()
#' airquality %>% add_label_missings(Ozone, Solar.R)
#' airquality %>% add_label_missings(Ozone, Solar.R, missing = "yes", complete = "no")
#'
add_label_missings <- function(data,
                               ...,
                               missing = "Missing",
                               complete = "Not Missing"){

  # Compute the labels before calling `dplyr::mutate()`. Evaluating
  # `missing = missing, complete = complete` inside the data mask would pick
  # up columns of `data` named `missing` or `complete` instead of the
  # arguments of this function.
  row_labels <- label_missings(data,
                               ...,
                               missing = missing,
                               complete = complete)

  # `!!` injects the precomputed vector, so no name lookup happens inside the
  # data mask.
  updated_data <- dplyr::mutate(data, any_missing = !!row_labels)

  tibble::as_tibble(updated_data)

}

#' Label shadow values as missing or not missing
#'
#' Powers `add_label_shadow`. For the moment it is an internal function.
#'
#' @param data data.frame
#' @param ... extra variable to label
#' @param missing character a label for when values are missing - defaults to "Missing"
#' @param complete character a label for when values are complete - defaults to "Not Missing"
#'
#' @return "Missing" or "Not Missing"
#' @keywords internal
#' @noRd
#'
label_shadow <- function(data,
                         ...,
                         missing = "Missing",
                         complete = "Not Missing"){

  vars_supplied <- !missing(...)

  # When variables are given, keep only those variables plus whatever
  # `quo_to_shade()` returns for them (presumably their shadow columns --
  # the helper's body is not visible here).
  if (vars_supplied) {
    shade_vars <- quo_to_shade(...)
    data <- dplyr::select(data, ..., !!!shade_vars)
  }

  # One flag per row from `any_row_shade()`; TRUE is labelled as missing.
  row_has_shade <- any_row_shade(data)

  dplyr::if_else(condition = row_has_shade == TRUE,
                 true = missing,
                 false = complete)

}

#' Add a column describing whether there is a shadow
#'
#' Instead of focussing on labelling whether there are missings, we instead
#'   focus on whether there have been any shadows created. This can be useful
#'   when data has been imputed and you need to determine which rows contained
#'   missing values when the shadow was bound to the dataset.
#'
#' @param data data.frame
#' @param ... extra variable to label
#' @param missing character a label for when values are missing - defaults to "Missing"
#' @param complete character a label for when values are complete - defaults to "Not Missing"
#'
#' @return data.frame with a column, "any_missing", which describes whether or
#'   not there are any rows that have a shadow value.
#'
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>%
#'   add_shadow(Ozone, Solar.R) %>%
#'   add_label_shadow()
#'
add_label_shadow <- function(data,
                             ...,
                             missing = "Missing",
                             complete = "Not Missing"){

  # Abort unless `any_shade(data)` is TRUE, i.e. unless `data` has shadow
  # columns according to that helper.
  # NOTE(review): the message literal spans two source lines, so the emitted
  # message contains a newline followed by the continuation line's indentation.
  if (!any_shade(data)) {
    rlang::abort("add_label_shadow works with shadow data, which has columns
                 created by `shade()`, `as_shadow()`, or `bind_shadow()`")
  }

  # Compute the labels before calling `dplyr::mutate()`. Inside the data mask,
  # `data`, `missing` and `complete` would resolve to columns of `data` with
  # those names (if present) rather than to the arguments of this function.
  shadow_labels <- label_shadow(data,
                                ...,
                                missing = missing,
                                complete = complete)

  # `!!` injects the precomputed vector, so no name lookup happens inside the
  # data mask.
  dplyr::mutate(data, any_missing = !!shadow_labels)
}


#' Add a column that tells us which "missingness cluster" a row belongs to
#'
#' A way to extract the cluster of missingness that a group belongs to.
#'     For example, if you use `vis_miss(airquality, cluster = TRUE)`, you can
#'     see some clustering in the data, but you do not have a way to identify
#'     the cluster. Future work will incorporate the `seriation` package to
#'     allow for better control over the clustering from the user.
#'
#' @param data a dataframe
#' @param cluster_method character vector of the agglomeration method to use,
#'    the default is "mcquitty". Options are taken from `stats::hclust`
#'    helpfile, and options include: "ward.D", "ward.D2", "single", "complete",
#'    "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or
#'    "centroid" (= UPGMC).
#' @param n_clusters numeric the number of clusters you expect. Defaults to 2.
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @export
#'
#' @examples
#'
#' add_miss_cluster(airquality)
#' add_miss_cluster(airquality, n_clusters = 3)
#' add_miss_cluster(airquality, cluster_method = "ward.D", n_clusters = 3)

add_miss_cluster <- function(data,
                             cluster_method = "mcquitty",
                             n_clusters = 2) {

  # Input checks are delegated to the package's validation helpers.
  test_if_null(data)
  test_if_dataframe(data)

  # Numeric indicator matrix: 1 where a value is missing, 0 otherwise.
  na_indicator <- is.na(data) * 1

  # Hierarchically cluster the rows on their missingness pattern and cut the
  # tree into `n_clusters` groups.
  row_distances <- stats::dist(na_indicator)
  cluster_tree <- stats::hclust(row_distances, method = cluster_method)
  cluster_ids <- stats::cutree(cluster_tree, k = n_clusters)

  # Attach the cluster membership and return a tibble.
  data$miss_cluster <- cluster_ids
  tibble::as_tibble(data)
}

Any scripts or data that you put into this service are public.

naniar documentation built on May 29, 2024, 1:43 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

naniar
Data Structures, Summaries, and Visualisations for Missing Data

R/add-cols.R
In naniar: Data Structures, Summaries, and Visualisations for Missing Data

Defines functions add_miss_cluster add_label_shadow label_shadow add_label_missings label_missings add_any_miss add_shadow_shift add_shadow

Documented in add_any_miss add_label_missings add_label_shadow add_miss_cluster add_shadow add_shadow_shift label_missings

Try the naniar package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

naniar Data Structures, Summaries, and Visualisations for Missing Data

R/add-cols.R In naniar: Data Structures, Summaries, and Visualisations for Missing Data

Defines functions add_miss_cluster add_label_shadow label_shadow add_label_missings label_missings add_any_miss add_shadow_shift add_shadow

Documented in add_any_miss add_label_missings add_label_shadow add_miss_cluster add_shadow add_shadow_shift label_missings

Try the naniar package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

naniar
Data Structures, Summaries, and Visualisations for Missing Data

R/add-cols.R
In naniar: Data Structures, Summaries, and Visualisations for Missing Data