R/acc_varcomp.R

#' Estimates variance components
#'
#' @description
#' Variance-based models and intraclass correlations (ICC) are approaches to
#' examine the impact of so-called process variables on the measurements. This
#' implementation is model-based.
#'
#' **NB:** The term ICC is frequently used to describe the agreement between
#' different observers, examiners or even devices. In such settings, good
#' agreement is the goal. ICC values range within `[-1;1]`, and an ICC close
#' to 1 is desired (Koo and Li 2016, Müller and Büttner 1994).
#'
#' However, in multi-level analysis the ICC is interpreted differently, see
#' Snijders and Bosker (1999). In this context, the proportion of variance
#' explained by the respective group levels indicates an influence of (at
#' least one) level of the respective group_vars. An ICC close to 0 is desired.
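#'
#' As a sketch of this interpretation (assuming the random-intercept model
#' fitted by this implementation), the reported ICC is the share of the total
#' variance that is attributable to the grouping:
#'
#' ```
#' ICC = var(group intercepts) / (var(group intercepts) + var(residuals))
#' ```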
#'
#' [Indicator]
#'
#' @details
#' # ALGORITHM OF THIS IMPLEMENTATION:
#'
#' - This implementation is currently restricted to data of type float.
#' - Missing codes are removed from resp_vars (if defined in the metadata)
#' - Deviations from limits, as defined in the metadata, are removed
#' - A linear mixed-effects model is estimated for resp_vars using co_vars and
#'   group_vars for adjustment (see the model sketch below).
#' - An output data frame is generated for group_vars indicating the ICC.
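#'
#' As an illustration of the model being fitted (variable and data names below
#' are purely hypothetical), the call corresponds to:
#'
#' ```
#' lme4::lmer(SBP_0 ~ SEX_0 + AGE_0 + (1 | OBSERVER_0),
#'            data = prepared_study_data, REML = TRUE)
#' ```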
#'
#' @export
#' @importFrom stats as.formula na.omit median
#'
#' @param resp_vars [variable list] the names of the continuous measurement
#'                                  variables
#' @param group_vars [variable list] the names of the resp. observer, device or
#'                                   reader variables
#' @param co_vars [variable list] a vector of covariables, e.g. age and sex for
#'                                adjustment
#' @param min_obs_in_subgroup [integer] from=0. optional argument if a
#'                                   "group_var" is used. This argument
#'                                   specifies the minimum no. of observations
#'                                   that is required to include a subgroup
#'                                   (level) of the "group_var" in the analysis.
#'                                   Subgroups with fewer observations are
#'                                   excluded. The default is 30.
#' @param min_subgroups [integer] from=0. optional argument if a "group_var" is
#'                                        used. This argument specifies the
#'                                        minimum no. of subgroups (levels)
#'                                        included in the "group_var". If the
#'                                        variable defined in "group_var" has
#'                                        fewer subgroups, it is not used for
#'                                        the analysis. The default is 5.
#' @param threshold_value [numeric] from=0 to=1. a numerical value ranging
#'                                              from 0-1
#' @param study_data [data.frame] the data frame that contains the measurements
#' @param meta_data [data.frame] the data frame that contains metadata
#'                               attributes of study data
#' @param label_col [variable attribute] the name of the column in the metadata
#'                                       with labels of variables
#'
#' @return a list with:
#'   - `SummaryTable`: data frame with ICCs per `rvs` (ICC column named
#'                     `ICC_acc_ud_loc`)
#'   - `SummaryData`: data frame with ICCs per `rvs`
#'   - `ScalarValue_max_icc`: maximum variance contribution value by group_vars
#'   - `ScalarValue_argmax_icc`: variable with maximum variance contribution by
#'                               group_vars
#'
#' @seealso
#' [Online Documentation](
#' https://dataquality.qihs.uni-greifswald.de/VIN_acc_impl_varcomp.html
#' )
#'
#' @examples
#' \dontrun{
#' # runs spuriously slow on rhub
#' load(system.file("extdata/study_data.RData", package = "dataquieR"))
#' load(system.file("extdata/meta_data.RData", package = "dataquieR"))
#' co_vars <- c("SEX_0", "AGE_0")
#' min_obs_in_subgroup <- 30
#' min_subgroups <- 3
#' label_col <- LABEL
#' rvs <- c("DBP_0", "SBP_0")
#' group_vars <- prep_map_labels(rvs, meta_data = meta_data, from = label_col,
#'   to = VAR_NAMES)
#' group_vars <- prep_map_labels(group_vars, meta_data = meta_data,
#'   to = GROUP_VAR_OBSERVER)
#' group_vars <- prep_map_labels(group_vars, meta_data = meta_data)
#' acc_varcomp(
#'   resp_vars = rvs, group_vars = group_vars, co_vars = co_vars,
#'   min_obs_in_subgroup = min_obs_in_subgroup,
#'   min_subgroups = min_subgroups, label_col = label_col,
#'   study_data = study_data, meta_data = meta_data
#' )
#' }
#'
acc_varcomp <-
  function(resp_vars = NULL, group_vars, co_vars = NULL,
           min_obs_in_subgroup = 30, min_subgroups = 5, label_col = NULL,
           threshold_value = 0.05, study_data, meta_data) { # TODO: does this function remove observers w/o observations?

  # preps ----------------------------------------------------------------------
  # map metadata to study data
  prep_prepare_dataframes(.replace_hard_limits = TRUE)
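  # this creates `ds1`, the study data with metadata labels applied and with
  # deviations from hard limits removed (.replace_hard_limits = TRUE); `ds1`
  # is used throughout the rest of the function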

  .min_obs_in_subgroup <- suppressWarnings(as.integer(min_obs_in_subgroup))
  if (is.na(.min_obs_in_subgroup)) {
    util_message(c(
      "Could not convert min_obs_in_subgroup %s to a number.",
      "Set to standard value."),
      dQuote(as.character(min_obs_in_subgroup)),
      applicability_problem = TRUE
    )
    min_obs_in_subgroup <- 30
  } else {
    min_obs_in_subgroup <- .min_obs_in_subgroup
  }

  .min_subgroups <- suppressWarnings(as.integer(min_subgroups))
  if (is.na(.min_subgroups)) {
    util_message(
      "Could not convert min_subgroups %s to a number. Set to standard value.",
      dQuote(as.character(min_subgroups)),
      applicability_problem = TRUE
    )
    min_subgroups <- 5
  } else {
    min_subgroups <- .min_subgroups
  }

  # correct variable use?
  util_correct_variable_use("resp_vars",
    allow_more_than_one = TRUE,
    allow_null = TRUE,
    need_type = "integer|float",
    need_scale = "interval|ratio"
  )

  util_correct_variable_use("group_vars",
    allow_null = TRUE,
    allow_any_obs_na = TRUE,
    allow_more_than_one = !TRUE, # TODO: Remove the whole functionality.
    need_type = "!float",
    need_scale = "nominal|ordinal"
  )

  util_correct_variable_use("co_vars",
    allow_na = TRUE,
    allow_more_than_one = TRUE,
    allow_null = TRUE
  )

  rvs <- resp_vars # TODO: Fix rvs <-> resp_vars

  if (length(rvs) == 0) {
    rvs <- colnames(ds1[, vapply(ds1, is.numeric, FUN.VALUE = logical(1))])
    rvs <- setdiff(rvs, group_vars)
    rvs <- setdiff(rvs, co_vars)
    if (length(group_vars) != 1)
      util_error(
        "Need exactly 1 group_vars, if all applicable resp_vars are used",
        applicability_problem = TRUE)
  }

  if (length(group_vars) == 1 && length(rvs) > 1) {
    group_vars <- rep(group_vars, length(rvs))
    if (!.called_in_pipeline)
      util_message(sprintf("using the same group var %s for all resp_vars",
                    dQuote(group_vars[[1]])
                    ))
  }

  if (length(group_vars) != length(rvs)) {
    util_error(
      c("acc_varcomp expects one group_var per resp_var. Here,",
        "it has been called with %d resp_vars but %d group_vars"),
      length(rvs), length(group_vars),
      applicability_problem = TRUE)
  }

  ds1[, unique(group_vars)] <- lapply(ds1[, unique(group_vars), drop = FALSE],
                                      factor)

  # (3) if no covariables are defined for adjustment only the intercept is
  #     modelled
  if (is.null(co_vars) || length(co_vars) == 0) {
    co_vars <- "1"
    cv <- character(0)
  } else {
    cv <- co_vars
    co_vars <- util_bQuote(co_vars)
  }
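  # with co_vars = NULL, the fixed-effects part of the model formula built
  # further below reduces to "1" (intercept only), e.g. `rv` ~ 1 + (1 | `group_var`)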

  # missing checks
  # -> encoded missings or jumps
  # -> resp_variables
  # -> variable types (factor(), ...)


  icc_output <- apply(cbind(rvs, group_vars), 1, function(row) {
    rv <- row[[1]]
    group_var <- row[[2]]

    check_df <- util_table_of_vct(ds1[[group_var]])

    # too few observations in >1 level of the random effect
    critical_levels <- levels(check_df$Var1)[check_df$Freq <
                                               min_obs_in_subgroup]
    if (length(critical_levels) > 0) {
      util_message("Levels %s were excluded due to less than %d observations.",
                   paste0(c(vapply(head(critical_levels, 10), dQuote, ""),
                            if (length(critical_levels) <= 10)
                              character(0) else "..."),
                          collapse = ", "),
                   min_obs_in_subgroup,
                   applicability_problem = FALSE)
      # exclude levels with too few observations
      ds1 <- ds1[!(ds1[[group_var]] %in% critical_levels), ]
      # dropping unused levels
      ds1[[group_var]] <- factor(ds1[[group_var]])
    }

    check_df <-
      util_table_of_vct(ds1[[group_var]]) # number of observers may have changed

    # fewer than min_subgroups levels of random effect
    if (length(check_df[, 1]) < min_subgroups) {
      util_message("%d < %d levels in %s. Will not compute ICCs for %s.",
                   length(check_df[, 1]),
                   min_subgroups,
                   dQuote(group_var),
                   dQuote(rv),
                   applicability_problem = TRUE,
                   intrinsic_applicability_problem = TRUE)
      return(data.frame(
        Variables = NA,
        Object = NA,
        Model.Call = NA,
        ICC = NA,
        Class.Number = NA,
        Mean.Class.Size = NA,
        Median.Class.Size = NA,
        Min.Class.Size = NA,
        Max.Class.Size = NA,
        convergence.problem = FALSE
      )[FALSE, , FALSE])
    }

    ds1[, rv] <- lapply(ds1[, rv, drop = FALSE], util_as_numeric)

    # prepare data frame for output
    res_df <- data.frame(
      Variables = NA,
      Object = NA,
      Model.Call = NA,
      ICC = NA,
      Class.Number = nrow(check_df),
      Mean.Class.Size = mean(check_df$Freq),
      Median.Class.Size = median(check_df$Freq),
      Min.Class.Size = min(check_df$Freq),
      Max.Class.Size = max(check_df$Freq),
      convergence.problem = FALSE
    )

    # fill output df
    res_df[1, 1] <- rv
    res_df[1, 2] <- group_var
    # create model formula from function arguments
    fmla <- as.formula(paste0(paste0(util_bQuote(rv), "~"),
                              paste0(c(co_vars, paste0("(1|", util_bQuote(group_var), ")")),
                                     collapse = "+")))
    # save formula for results
    res_df[1, 3] <- Reduce(paste, deparse(fmla))

    # remove missing values in the used variables
    subdf <- na.omit(ds1[, c(rv, intersect(cv, colnames(ds1)),
                             group_var), drop = FALSE])
    # drop unused levels (https://stackoverflow.com/a/1197154)
    subdf[] <- lapply(subdf, function(x) if (is.factor(x)) factor(x) else x)

    env <- environment()

    convergence.problem <- FALSE

    suppressMessages(withCallingHandlers(
      # fit mixed effects model with formula
      fit_lmer <- try(lme4::lmer(fmla, data = subdf, REML = TRUE)),
      message = function(m) {
        if (inherits(m, "simpleMessage") && any(
            grepl("singular",
             trimws(conditionMessage(m))))) {
          env$convergence.problem <- TRUE
        } else {
          util_message(m) # nocov
          # This is to handle other possible problems with the model
          # Since it is for unexpected problems, I cannot write a test
        }
      },
      warning = function(w) {
        # nocov start
        # on travis, the try above prevents a stop on convergence problems
        # for some unclear reasons. therefore, on travis,
        # the following code cannot be reached. However, locally, this
        # is possible.
        haystack <- trimws(paste0(conditionMessage(w), collapse = "\n"))
        if (inherits(w, "simpleWarning") && any(vapply(
            c("unable to evaluate scaled gradient",
              "Problem with Hessian check",
              "Model is nearly unidentifiable: very large eigenvalue",
              "convergence code [0-9\\-].+ from nloptwrap",
              "Model failed to converge with"
              ), grepl, haystack, perl = TRUE, FUN.VALUE = logical(1)))) {
          env$convergence.problem <- TRUE
          invokeRestart("muffleWarning")
        } else {
          warning(w)
          # This is to handle other possible problems with the model
          # Since it is for unexpected problems, I cannot write a test
        }
        # nocov end
      }
    ))

    res_df[1, "convergence.problem"] <- convergence.problem

    if (inherits(fit_lmer, "try-error")) {
      res_df[1, "convergence.problem"] <- TRUE
    } else {
      # extract variance of random effects
      v_tab <- as.data.frame(lme4::VarCorr(fit_lmer))
      res_df$Class.Number[[1]] <- lme4::ngrps(fit_lmer)
      # calculate icc
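      # as.data.frame(lme4::VarCorr(fit_lmer)) lists the random-intercept
      # variance first and the residual variance last, so the ratio below is
      # var(group) / (var(group) + var(residual))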
      res_df[1, 4] <- round(v_tab$vcov[1] / (v_tab$vcov[1] + v_tab$vcov[2]), 3)
    }

    class_sizes <- util_table_of_vct(subdf[, group_var])
    res_df <- within(res_df, {
      Mean.Class.Size[[1]] <- mean(class_sizes$Freq)
      Median.Class.Size[[1]] <- median(class_sizes$Freq)
      Min.Class.Size[[1]] <- min(class_sizes$Freq)
      Max.Class.Size[[1]] <- max(class_sizes$Freq)
    })

    # output
    return(res_df)
  })

  icc_output <- as.data.frame(do.call(dplyr::bind_rows, icc_output),
                              stringsAsFactors = FALSE)
  rownames(icc_output) <- NULL

  max_icc <- suppressWarnings(max(icc_output$ICC, na.rm = TRUE))
  argmax_icc <- icc_output$Variables[which.max(icc_output$ICC)]

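  # GRADING is 1 if the ICC reaches the threshold_value (flagging the
  # respective resp_var), 0 otherwise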
  icc_output[["GRADING"]] <- as.numeric(threshold_value <= icc_output$ICC)

  ## Format

  rownames(icc_output) <- NULL

  SummaryTable <- icc_output
  names(SummaryTable)[names(SummaryTable) == "ICC"] <- "ICC_acc_ud_loc"

  return(list(SummaryTable = SummaryTable,
              SummaryData = icc_output,
              ScalarValue_max_icc = max_icc,
              ScalarValue_argmax_icc = argmax_icc))
}
