R/smdi_asmd.R
In smdi: Perform Structural Missing Data Investigations

Documented in smdi_asmd

#' Computes mean/median absolute standardized mean differences between observed and missing observations
#'
#' @description
#' This function takes a dataframe with covariates which are partially observed/missing and returns the
#' median/average absolute standardized mean difference (asmd) and more details for every specified covariate in covar
#' (if NULL all covariates with at least one NA are considered).
#'
#' Important: don't include variables like ID variables, ZIP codes, dates, etc.
#'
#' @details
#' The asmd may be one indicator as to how much patient characteristics differ between patients with and without an observed
#' value for a partially observed covariate. If the median/average asmd is above a certain threshold this may indicate
#' imbalance in patient covariate distributions which may be indicative of the partially observed covariate following a
#' missing at random (MAR) mechanims, i.e. the missingness is explainable by other observed covariates. Similarly,
#' no imbalance between observed covariates may be indicative that missingness cannot be explained with observed
#' covariates and the underlying missingness mechanism may be completely at random (MCAR) or not at random (e.g.
#' missingness is only associated with unobserved factors or through the partially observed covariate itself).
#'
#' A clear cut-off is hard to determine and analogues to propensity scores,
#' some researchers have proposed that a standardized difference of 0.1 (10 per cent)
#' denotes meaningful imbalance in the baseline covariate.
#'
#' The asmd is computed for every covariate one-by-one and not jointly. If there is multivariate
#' missingness, i.e. more than just one missing covariate exists, you can decide what should
#' happen with the other partially observed 'predictor' covariates using the includeNA parameter.
#' That is, if includeNA is set to FALSE (default), only the asmd between observed cases will be computed,
#' and if includeNA is set to TRUE, missingness is modeled as an explicit category (categorical covariates only).
#'
#' If any other behavior is desired, data transformations for example with the \code{\link{smdi_na_indicator}} function, may make sense
#' before calling the function.
#'
#' The dataframe should generally consist of the exposure variable, the outcome variable(s), the partially observed covariates
#' and all other fully observed covariates which are deemed important for the final modeling
#' and (optionally) which could be considered as auxiliary variables. If no partially observed covariates are provided,
#' the function automatically looks for all variables/columns with NA (powered by the \code{\link{smdi_summarize}} function)
#'
#' @seealso
#' \code{\link[tableone]{CreateTableOne}}
#'
#' @references
#' Austin PC. Balance diagnostics for comparing the distribution of baseline covariates between treatment groups in propensity-score matched samples. Stat Med. 2009 Nov 10;28(25):3083-107.
#'
#' Normand SLT, Landrum MB, Guadagnoli E, Ayanian JZ, Ryan TJ, Cleary PD, McNeil BJ. Validating recommendations for coronary angiography following an acute myocardial infarction in the elderly: a matched analysis using propensity scores. Journal of Clinical Epidemiology. 2001;54:387–398.
#'
#' @param data dataframe or tibble object with partially observed/missing variables
#' @param covar character covariate or covariate vector with partially observed variable/column name(s) to investigate. If NULL, the function automatically includes all columns with at least one missing observation and all remaining covariates will be used as predictors
#' @param median logical if the median (= TRUE; recommended default) or mean of all absolute standardized mean differences (asmd) should be computed
#' @param includeNA logical, should missingness of other partially observed covariates be explicitly modeled (default is FALSE)
#' @param n_cores integer, if >1, computations will be parallelized across amount of cores specified in n_cores (only UNIX systems)
#'
#' @return returns an asmd object with average/median absolute standardized mean differences. That is, for each covar, the following outputs are provided:
#'
#' - asmd_covar: name of covariate investigated
#'
#' - asmd_table1: detailed "table 1" illustrating distributions and differences of patient characteristics between those without (1) and with (0) observed covariate
#'
#' - asmd_plot: plot of absolute standardized mean differences (asmd) between patients without (1) and with (0) observed covariate (sorted by asmd)
#'
#' - asmd_aggregate: average/median absolute standardized mean difference (and min, max) of patient characteristics between those without (1) and with (0) observed covariate
#'
#' @importFrom magrittr '%>%'
#' @importFrom dplyr across
#' @importFrom dplyr arrange
#' @importFrom dplyr filter
#' @importFrom dplyr mutate
#' @importFrom dplyr pull
#' @importFrom dplyr summarize_all
#' @importFrom dplyr where
#' @importFrom fastDummies dummy_cols
#' @importFrom forcats fct_reorder
#' @importFrom ggplot2 aes
#' @importFrom ggplot2 geom_point
#' @importFrom ggplot2 ggplot
#' @importFrom ggplot2 labs
#' @importFrom ggplot2 scale_color_identity
#' @importFrom ggplot2 theme_bw
#' @importFrom glue glue
#' @importFrom parallel detectCores
#' @importFrom parallel mclapply
#' @importFrom stats median
#' @importFrom tableone CreateTableOne
#' @importFrom tableone ExtractSmd
#' @importFrom tibble tibble
#' @importFrom tibble rownames_to_column
#'
#' @export
#'
#' @examples
#' library(smdi)
#' library(dplyr)
#'
#' # S3 print method
#' asmd <- smdi_asmd(data = smdi_data)
#' asmd
#'
#' # let's look at the first variable
#' # we can check the complete covariate distribution
#' asmd$pdl1_num$asmd_table1
#'

smdi_asmd <- function(data = NULL,
                      covar = NULL,
                      median = TRUE,
                      includeNA = FALSE,
                      n_cores = 1
                      ){


  covariate <- `1 vs 2` <- NULL

  # pre-checks
  if(is.null(data)){stop("No dataframe provided.")}

  # n_cores on windows
  if(Sys.info()[["sysname"]]=="Windows"){
    message("Windows does not support parallelization based on forking. <n_cores> will be set to 1.")
    n_cores = 1
  }

  # more cores than available
  if(n_cores > parallel::detectCores()){
    message("You specified more <n_cores> than you have available. The function will use all cores available to it.")
    }

  # pick missing indicator columns/partially observed covariates
  # check for missing covariates
  covar_miss <- smdi::smdi_check_covar(
    data = data,
    covar = covar
    )

  # start applying smd computation over all partially observed covariates
  smd_loop <- function(i){

    # apply smdi_na_indicator for i to create missing
    # indicator variable
    strata_df <- smdi::smdi_na_indicator(
      data = data,
      covar = i,
      drop_NA_col = TRUE
      )

    # create strata variable
    strata_var <- paste0(i, "_NA")

    # create tableone
    tbl1 <- tableone::CreateTableOne(
        data = strata_df,
        # all covariates except strata covariate
        vars = names(strata_df)[ !names(strata_df) == strata_var],
        # strata covariate
        strata = strata_var,
        # if multiple variables are missing, NA is an own category
        includeNA = includeNA,
        smd = TRUE
        )

    # extract smd and compute median/mean
    smd <- tableone::ExtractSmd(tbl1) %>%
      as.data.frame() %>%
      tibble::rownames_to_column(var = "covariate") %>%
      dplyr::filter(covariate != i) %>%
      dplyr::rename(smd = `1 vs 2`)

    # asmd plot
    asmd_plot <- smd %>%
      ggplot2::ggplot(
        ggplot2::aes(
          y = forcats::fct_reorder(covariate, smd),
          x = smd,
          color = ifelse(smd < 0.1, "blue", "orange")
          )
        ) +
      ggplot2::geom_point(size = 3) +
      ggplot2::labs(
        title = glue::glue("ASMD plot for covariate {i}"),
        x = "Absolute standardized mean difference [ASMD]",
        y = "",
        color = "asmd < 0.1",
        caption = glue::glue("ASMD is computed as the asmd between patients with and without observed '{i}'")
        ) +
      ggplot2::scale_color_identity() +
      ggplot2::theme_bw()

    if(isTRUE(median)){

      asmd_aggregate <- tibble::tibble(
        covariate = paste(i),
        asmd_median = stats::median(smd$smd),
        asmd_min = min(smd$smd),
        asmd_max = max(smd$smd)
        )

    }else{

      asmd_aggregate <- tibble::tibble(
        covariate = paste(i),
        asmd_mean = mean(smd$smd),
        asmd_min = min(smd$smd),
        asmd_max = max(smd$smd)
        )

    }

    # finally, round asmd_aggregate to three decimals
    asmd_aggregate <- asmd_aggregate %>%
      dplyr::mutate(
        dplyr::across(
          dplyr::where(is.numeric),
          ~formatC(.x, format = "f", digits = 3)
          )
        )

    # assemble lapply output object
    return(
      list(
        asmd_covar = i,
        asmd_table1 = print(tbl1, smd = TRUE, printToggle = FALSE),
        asmd_plot = asmd_plot,
        asmd_aggregate = asmd_aggregate
        )
      )

    } # loop ends

  # iterate above analyses over all specified
  # partially observed covariates
  asmd_out <- parallel::mclapply(covar_miss, FUN = smd_loop, mc.cores = n_cores)
  names(asmd_out) <- covar_miss

  class(asmd_out) <- "asmd"

  return(asmd_out)

}

# generic print -----------------------------------------------------------

#' @export
print.asmd <- function(x, ...){

  tbl <- do.call(rbind, lapply(x,'[[',4))

  return(print(tbl))

}

# generic summary ---------------------------------------------------------

#' @export
summary.asmd <- function(object, ...){

  tbl <- do.call(rbind, lapply(object,'[[',4))

  return(tbl)

}