#' Add a shadow column to dataframe
#'
#' As an alternative to `bind_shadow()`, you can add specific individual shadow
#' columns to a dataset. These also respect the dplyr verbs
#' `starts_with`, `contains`, `ends_with`, etc.
#'
#' @param data data.frame
#' @param ... One or more unquoted variable names, separated by commas. These also
#' respect the dplyr verbs `starts_with`, `contains`, `ends_with`, etc.
#'
#' @return data.frame
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>% add_shadow(Ozone)
#' airquality %>% add_shadow(Ozone, Solar.R)
#'
add_shadow <- function(data, ...){
test_if_dots_missing(
dots_empty = missing(...),
msg = "{.fun add_shadow} requires variables to be selected"
)
shadow_df <- dplyr::select(data, ...) %>% as_shadow()
data <- tibble::as_tibble(data)
shadow_df <- tibble::as_tibble(shadow_df)
dplyr::bind_cols(data, shadow_df)
}
#' Add a shadow shifted column to a dataset
#'
#' Shadow shift missing values using only the selected variables in a dataset,
#' by specifying variable names or use dplyr `vars` and dplyr verbs
#' `starts_with`, `contains`, `ends_with`, etc.
#'
#' @param data data.frame
#' @param ... One or more unquoted variable names separated by commas. These also
#' respect the dplyr verbs `starts_with`, `contains`, `ends_with`, etc.
#' @param suffix suffix to add to variable, defaults to "shift"
#'
#' @return data with the added variable shifted named as `var_suffix`
#'
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>% add_shadow_shift(Ozone, Solar.R)
#'
add_shadow_shift <- function(data, ..., suffix = "shift"){
# if no variables are selected use all of the variables
if (missing(...)) {
shadow_shifted_df <- purrr::map_dfc(data, impute_below)
# change names
names(shadow_shifted_df) <- paste0(names(shadow_shifted_df), "_", suffix)
data <- tibble::as_tibble(data)
shadow_shifted_df <- tibble::as_tibble(shadow_shifted_df)
return(dplyr::bind_cols(data, shadow_shifted_df))
}
# select variables
shadow_shifted_vars <- dplyr::select(data, ...)
# shadow shift all (using purrr:map_df)
# would be good to have a way of indicating that no shift was taken at all
shadow_shifted_df <- purrr::map_dfc(shadow_shifted_vars, impute_below)
# change names
names(shadow_shifted_df) <- paste0(names(shadow_shifted_df),"_",suffix)
data <- tibble::as_tibble(data)
shadow_shifted_df <- tibble::as_tibble(shadow_shifted_df)
return(dplyr::bind_cols(data, shadow_shifted_df))
}
#' Add a column describing presence of any missing values
#'
#' This adds a column named "any_miss" (by default) that describes whether
#' there are any missings in all of the variables (default), or whether any
#' of the specified columns, specified using variables names or dplyr verbs,
#' `starts_with`, `contains`, `ends_with`, etc. By default the added column
#' will be called "any_miss_all", if no variables are specified, otherwise,
#' if variables are specified, the label will be "any_miss_vars" to indicate
#' that not all variables have been used to create the labels.
#'
#'
#' By default the
#' prefix "any_miss" is used, but this can be changed in the `label` argument.
#'
#' @param data data.frame
#' @param ...
#'
#' Variable names to use instead of the whole dataset. By default this
#' looks at the whole dataset. Otherwise, this is one or more unquoted
#' expressions separated by commas. These also respect the dplyr verbs
#' `starts_with`, `contains`, `ends_with`, etc. By default will add "_all" to
#' the label if left blank, otherwise will add "_vars" to distinguish that it
#' has not been used on all of the variables.
#' @param label label for the column, defaults to "any_miss". By default if no
#' additional variables are listed the label col is "any_miss_all", otherwise
#' it is "any_miss_vars", if variables are specified.
#' @param missing character a label for when values are missing - defaults to "missing"
#' @param complete character character a label for when values are complete - defaults to "complete"
#'
#' @return data.frame with data and the column labelling whether that row (for
#' those variables) has any missing values - indicated by "missing" and
#' "complete".
#'
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>% add_any_miss()
#' airquality %>% add_any_miss(Ozone, Solar.R)
#'
add_any_miss <- function(data, ...,
label = "any_miss",
missing = "missing",
complete = "complete"){
# if no variables are specified, do for all, and add the label "all"
if (missing(...)) {
stub_data_label <- data %>%
dplyr::mutate(.temp = any_row_miss(data),
.temp_label = dplyr::if_else(condition = .temp == TRUE,
true = missing,
false = complete)) %>%
dplyr::select(.temp_label) %>%
tibble::as_tibble()
names(stub_data_label) <- paste0(label,"_all")
return(
dplyr::bind_cols(data, stub_data_label) %>% tibble::as_tibble()
)
}
stub_data <- dplyr::select(data, ...)
stub_data_label <- stub_data %>%
dplyr::mutate(.temp = any_row_miss(stub_data),
.temp_label = dplyr::if_else(condition = .temp == TRUE,
true = missing,
false = complete)) %>%
dplyr::select(.temp_label) %>%
tibble::as_tibble()
names(stub_data_label) <- paste0(label,"_vars")
dplyr::bind_cols(data, stub_data_label) %>% tibble::as_tibble()
}
#' Is there a missing value in the row of a dataframe?
#'
#' Creates a character vector describing presence/absence of missing values
#'
#' @param data a dataframe or set of vectors of the same length
#'
#' @return character vector of "Missing" and "Not Missing".
#' @param ... extra variable to label
#' @param missing character a label for when values are missing - defaults to "Missing"
#' @param complete character character a label for when values are complete - defaults to "Not Missing"
#'
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' label_missings(airquality)
#'
#' \dontrun{
#' library(dplyr)
#'
#' airquality %>%
#' mutate(is_missing = label_missings(airquality)) %>%
#' head()
#'
#' airquality %>%
#' mutate(is_missing = label_missings(airquality,
#' missing = "definitely missing",
#' complete = "absolutely complete")) %>%
#' head()
#' }
label_missings <- function(data,
...,
missing = "Missing",
complete = "Not Missing"){
test_if_null(data)
# find which are missing and which are not.
any_row_na <- function(x){
apply(data.frame(x), MARGIN = 1, FUN = function(x) anyNA(x))
}
if (!missing(...)) {
data <- dplyr::select(data, ...)
}
temp <- any_row_na(data)
dplyr::if_else(condition = temp == TRUE, # TRUE means missing
true = missing,
false = complete)
}
#' Add a column describing if there are any missings in the dataset
#'
#' @param data data.frame
#' @param ... extra variable to label
#' @param missing character a label for when values are missing - defaults to "Missing"
#' @param complete character character a label for when values are complete - defaults to "Not Missing"
#'
#' @return data.frame with a column "any_missing" that is either "Not Missing"
#' or "Missing" for the purposes of plotting / exploration / nice print methods
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>% add_label_missings()
#' airquality %>% add_label_missings(Ozone, Solar.R)
#' airquality %>% add_label_missings(Ozone, Solar.R, missing = "yes", complete = "no")
#'
add_label_missings <- function(data,
...,
missing = "Missing",
complete = "Not Missing"){
# data %>%
# dplyr::mutate(any_missing = label_missings(.)) %>%
# dplyr::as_tibble()
updated_data <- data %>%
dplyr::mutate(any_missing = label_missings(.,
...,
missing = missing,
complete = complete))
return(tibble::as_tibble(updated_data))
}
#' Label shadow values as missing or not missing
#'
#' Powers `add_label_shadow`. For the moment it is an internal function.
#'
#' @param data data.frame
#' @param ... extra variable to label
#' @param missing character a label for when values are missing - defaults to "Missing"
#' @param complete character character a label for when values are complete - defaults to "Not Missing"
#'
#' @return "Missing" or "Not Missing"
#' @keywords internal
#' @noRd
#'
label_shadow <- function(data,
...,
missing = "Missing",
complete = "Not Missing"){
# any_shade <- function(x) any(grepl("^NA|^NA_", x))
if (!missing(...)) {
shadow_vars <- quo_to_shade(...)
data <- dplyr::select(data, ..., !!!shadow_vars)
}
temp <- any_row_shade(data)
dplyr::if_else(condition = temp == TRUE, # TRUE means missing
true = missing,
false = complete)
}
#' Add a column describing whether there is a shadow
#'
#' Instead of focussing on labelling whether there are missings, we instead
#' focus on whether there have been any shadows created. This can be useful
#' when data has been imputed and you need to determine which rows contained
#' missing values when the shadow was bound to the dataset.
#'
#' @param data data.frame
#' @param ... extra variable to label
#' @param missing character a label for when values are missing - defaults to "Missing"
#' @param complete character character a label for when values are complete - defaults to "Not Missing"
#'
#' @return data.frame with a column, "any_missing", which describes whether or
#' not there are any rows that have a shadow value.
#'
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>%
#' add_shadow(Ozone, Solar.R) %>%
#' add_label_shadow()
#'
add_label_shadow <- function(data,
...,
missing = "Missing",
complete = "Not Missing"){
if (!any_shade(data)) {
rlang::abort("add_label_shadow works with shadow data, which has columns
created by `shade()`, `as_shadow()`, or `bind_shadow()`")
}
updated_data <- dplyr::mutate(data,
any_missing = label_shadow(data,
...,
missing = missing,
complete = complete))
return(updated_data)
}
#' Add a column that tells us which "missingness cluster" a row belongs to
#'
#' A way to extract the cluster of missingness that a group belongs to.
#' For example, if you use `vis_miss(airquality, cluster = TRUE)`, you can
#' see some clustering in the data, but you do not have a way to identify
#' the cluster. Future work will incorporate the `seriation` package to
#' allow for better control over the clustering from the user.
#'
#' @param data a dataframe
#' @param cluster_method character vector of the agglomeration method to use,
#' the default is "mcquitty". Options are taken from `stats::hclust`
#' helpfile, and options include: "ward.D", "ward.D2", "single", "complete",
#' "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or
#' "centroid" (= UPGMC).
#' @param n_clusters numeric the number of clusters you expect. Defaults to 2.
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @export
#'
#' @examples
#'
#' add_miss_cluster(airquality)
#' add_miss_cluster(airquality, n_clusters = 3)
#' add_miss_cluster(airquality, cluster_method = "ward.D", n_clusters = 3)
add_miss_cluster <- function(data,
cluster_method = "mcquitty",
n_clusters = 2) {
test_if_null(data)
test_if_dataframe(data)
data_na <- is.na(data)
miss_cluster <- stats::dist(data_na*1) %>%
stats::hclust(method = cluster_method) %>%
stats::cutree(k = n_clusters)
data$miss_cluster <- miss_cluster
tibble::as_tibble(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.