R/utils_get_data.R
In insight: Easy Access to Model Information for Various Model Objects

# Function that does the most work for preparing and transforming the data,
# to ensure we have a "clean" data frame from the data that was used to fit
# the model. This also means that, unless necessary for further processing,
# variables transformed during model fitting are not included in this data frame
.prepare_get_data <- function(x, mf, effects = "fixed", verbose = TRUE) {
  # check if we have any data yet
  if (is_empty_object(mf)) {
    if (isTRUE(verbose)) {
      format_warning("Could not get model data.")
    }
    return(NULL)
  }

  # we may store model weights here later
  mw <- NULL


  # make sure it's a data frame -----------------------------------------------

  if (!is.data.frame(mf)) {
    mf <- .safe(as.data.frame(mf))
    if (is.null(mf)) {
      if (isTRUE(verbose)) {
        format_warning(
          "Cannot coerce data into a data frame.",
          "No data will be returned."
        )
      }
      return(NULL)
    }
  }


  # offset variables ----------------------------------------------------------

  # do we have an offset, not specified in the formula?

  # This is a bit slower - restore if tests fail...
  # offcol <- grep("^(\\(offset\\)|offset\\((.*)\\))", colnames(mf))

  offcol <- grepl("(offset)", colnames(mf), fixed = TRUE) | grepl("offset(", colnames(mf), fixed = TRUE)
  model_call <- get_call(x)
  if (length(offcol) && object_has_names(model_call, "offset")) {
    colnames(mf)[offcol] <- clean_names(safe_deparse(model_call$offset))
  }


  # backtransform variables, such as log, sqrt etc ----------------------------

  mf <- .backtransform(mf, x)


  # clean 1-dimensional matrices ---------------------------------------------

  # in particular, transformation like "scale()" may produce a 1D-matrix,
  # where we want a vector instead
  mf[] <- lapply(mf, function(.x) {
    if (is.matrix(.x) && dim(.x)[2] == 1 && !inherits(.x, c("ns", "bs", "poly", "mSpline"))) {
      as.vector(.x)
    } else {
      .x
    }
  })


  # detect matrix columns ----------------------------------------------------

  # check if we have any matrix columns, e.g. from splines
  mc <- vapply(mf, is.matrix, TRUE)

  # save original response value and the respective single variable names of
  # the response for later. we don't want to change the response value,
  # if it's a matrix bound with "cbind()"
  rn <- find_response(x, combine = TRUE)
  rn_not_combined <- find_response(x, combine = FALSE)

  # make sure rn is not NULL, but empty string
  if (is.null(rn)) rn <- ""
  if (is.null(rn_not_combined)) rn_not_combined <- ""

  trials.data <- NULL


  # restore original variables used in matrix-response columns ----------------

  if (mc[1] && rn == colnames(mf)[1]) {
    mc[1] <- FALSE
    if (inherits(x, c("coxph", "flexsurvreg", "coxme", "survreg", "survfit", "crq", "psm", "coxr"))) {
      n_of_responses <- ncol(mf[[1]])
      mf <- cbind(as.data.frame(as.matrix(mf[[1]])), mf)
      colnames(mf)[1:n_of_responses] <- rn_not_combined[1:n_of_responses]
    } else {
      tryCatch(
        {
          trials.data <- as.data.frame(mf[[1]])
          colnames(trials.data) <- rn_not_combined

          # if columns were bound via subtraction, e.g.
          # "cbind(success, total - success)", we need to sum up success and
          # total for the original total-column.

          pattern <- sprintf("%s(\\s*)-(\\s*)%s", rn_not_combined[2], rn_not_combined[1])
          if (any(grepl(pattern = pattern, x = rn))) {
            trials.data[[2]] <- trials.data[[1]] + trials.data[[2]]
          }
        },
        error = function(x) {
          NULL
        }
      )
    }
  }


  # process matrix-variables (restore original data from matrix variables) ----

  # if we have any matrix columns, we remove them from original
  # model frame and convert them to regular data frames, give
  # proper column names and bind them back to the original model frame
  if (any(mc)) {
    # try to get model data from environment
    md <- .safe(eval(model_call$data, environment(stats::formula(x))))

    # in case we have missing data, the data frame in the environment
    # has more rows than the model frame
    if (isTRUE(!is.null(md) && nrow(md) != nrow(mf))) {
      md <- .safe({
        md <- .recover_data_from_environment(x)
        md <- stats::na.omit(md[intersect(
          colnames(md),
          find_variables(x, effects = "all", component = "all", flatten = TRUE)
        )])
      })
    }

    # if data not found in environment,
    # reduce matrix variables into regular vectors
    if (is.null(md)) {
      # we select the non-matrix variables and convert matrix-variables into
      # regular data frames, then binding them together
      mf_matrix <- mf[, which(mc), drop = FALSE]
      mf_nonmatrix <- mf[, -which(mc), drop = FALSE]

      # fix for rms::rcs() functions
      if (any(inherits(mf_matrix[[1]], "rms"))) {
        class(mf_matrix[[1]]) <- "matrix"
      }

      # matrix to data frame, bind to model frame
      mf_list <- lapply(mf_matrix, as.data.frame, stringsAsFactors = FALSE)
      mf_matrix <- do.call(cbind, mf_list)
      mf <- cbind(mf_nonmatrix, mf_matrix)
    } else {
      # fix NA in column names
      if (anyNA(colnames(md))) {
        colnames(md) <- make.names(colnames(md))
      }

      # get "matrix" terms and "normal" predictors,
      # but exclude response variable(s)
      mf_matrix <- mf[, -which(mc), drop = FALSE]
      spline.term <- clean_names(names(which(mc)))
      other.terms <- clean_names(colnames(mf_matrix))[-1]

      # now we have all variable names that we need
      # from the original data set
      needed.vars <- c(other.terms, spline.term)

      # if response is a matrix vector (e.g. multivariate response),
      # we need to include all response names as well, because else
      # rows may not match due to additional missings in the response variables
      if (is.matrix(mf[[1]])) {
        needed.vars <- c(dimnames(mf[[1]])[[2]], needed.vars)
      } else {
        needed.vars <- c(colnames(mf)[1], needed.vars)
      }

      # check model weights
      if ("(weights)" %in% needed.vars && !object_has_names(md, "(weights)")) {
        needed.vars <- needed.vars[-which(needed.vars == "(weights)")]
        mw <- mf[["(weights)"]]
        fw <- find_weights(x)
        if (!is.null(fw) && all(fw %in% colnames(md))) {
          needed.vars <- c(needed.vars, fw)
        }
      }

      if (inherits(x, c("coxph", "coxme", "coxr")) || any(startsWith(spline.term, "Surv("))) {
        # no further processing for survival models
        mf <- md
      } else {
        # get cleaned variable names for those variables
        # that we still need from the original model frame
        needed.vars <- compact_character(unique(clean_names(needed.vars)))

        # check if data in environment was modified in between?
        if (!all(needed.vars %in% colnames(md))) {
          needed.vars <- intersect(needed.vars, colnames(md))
        }

        # if data recovered from environment does not match current
        # model frame, tell user and skip this step
        if (!length(needed.vars) || nrow(md) != nrow(mf)) {
          if (isTRUE(verbose)) {
            format_warning(
              "Could not find all model variables in the data.",
              "Maybe the original data frame used to fit the model was modified?"
            )
          }
        } else {
          mf <- md[, needed.vars, drop = FALSE]

          # we need this hack to save variable and value label attributes, if any
          value_labels <- lapply(mf, attr, which = "labels", exact = TRUE)
          variable_labels <- lapply(mf, attr, which = "label", exact = TRUE)

          # removing NAs drops all label-attributes
          mf <- stats::na.omit(mf)

          # then set back attributes
          mf <- as.data.frame(Map(function(.d, .l) {
            attr(.d, "labels") <- .l
            .d
          }, mf, value_labels), stringsAsFactors = FALSE)

          mf <- as.data.frame(Map(function(.d, .l) {
            attr(.d, "label") <- .l
            .d
          }, mf, variable_labels), stringsAsFactors = FALSE)
        }
      }

      # add back model weights, if any
      if (!is.null(mw)) mf$`(weights)` <- mw
    }

    # check if we really have all formula terms in our model frame now
    pv <- .safe(find_predictors(x, effects = effects, flatten = TRUE, verbose = verbose))

    # still some undetected matrix-variables?
    if (!is.null(pv) && !all(pv %in% colnames(mf)) && isTRUE(verbose)) {
      format_warning(
        "Some model terms could not be found in model data.",
        "You probably need to load the data into the environment."
      )
    }
  }


  # monotonic predictors ------------------------------------------------------

  # check if we have monotonic variables, included in formula
  # with "mo()"? If yes, remove from model frame
  if (inherits(x, "brmsfit")) {
    mos_eisly <- grepl("^mo\\(([^,)]*).*", colnames(mf))
    if (any(mos_eisly)) {
      mf <- mf[!mos_eisly]
    }
  }


  # strata-variables in coxph() -----------------------------------------------

  strata_columns <- grepl("strata(", colnames(mf), fixed = TRUE)
  if (any(strata_columns)) {
    for (sc in colnames(mf)[strata_columns]) {
      strata_variable <- gsub("strata\\((.*)\\)", "\\1", sc)
      levels(mf[[sc]]) <- gsub(paste0(strata_variable, "="), "", levels(mf[[sc]]), fixed = TRUE)
    }
  }


  # restore original data for factors and logicals -----------------------------

  # are there any factor variables that have been coerced "on-the-fly",
  # using "factor()" or "as.factor()"? if so, get names and convert back
  # to numeric later
  factors <- colnames(mf)[grepl("^(as\\.factor|as_factor|factor|as\\.ordered|ordered)\\((.*)\\)", colnames(mf))]
  logicals <- colnames(mf)[grepl("^(as\\.logical|as_logical|logical)\\((.*)\\)", colnames(mf))]

  ## TODO check! some lines above (see "mos_eisly"). monotonic terms are removed from the model frame

  # check for monotonic terms and valid values. In case 'mo()' is used,
  # and predictor is numeric, prettyfied values in the data grid are based
  # on the range of the numeric variable, although only those values are allowed
  # in the data grid that actually appear in the data
  if (inherits(x, "brmsfit")) {
    model_terms <- find_terms(x, flatten = TRUE)
    monotonics <- grepl("mo\\((.*)\\)", model_terms)
    if (any(monotonics)) {
      factors <- union(factors, gsub("mo\\((.*)\\)", "\\1", model_terms[monotonics]))
    }
  }

  # clean variable names
  cvn <- .remove_pattern_from_names(colnames(mf), ignore_lag = TRUE)


  # check for interaction pattern ----------------------------------------

  ints <- grepl("interaction(", colnames(mf), fixed = TRUE)
  interactions <- NULL
  # add names of 2nd interaction term
  if (any(ints)) {
    if (inherits(x, "brmsfit")) {
      # brms saves original variables, so remove interaction column here
      mf[ints] <- NULL
      cvn <- cvn[!ints]
    } else {
      interactions <- stats::setNames(cvn[ints], trim_ws(gsub("interaction\\((.*),(.*)\\)", "\\2", colnames(mf)[ints])))
      factors <- unique(c(factors, interactions, names(interactions)))
    }
  }


  # as-is variables I() -------------------------------------------------------

  # keep "as is" variable for response variables in data frame
  if (colnames(mf)[1] == rn[1] && grepl("I(", rn[1], fixed = TRUE, ignore.case = FALSE)) {
    md <- .safe({
      tmp <- .recover_data_from_environment(x)[, unique(c(rn_not_combined, cvn)), drop = FALSE]
      tmp[, rn_not_combined, drop = FALSE]
    })

    if (!is.null(md)) {
      mf <- cbind(mf, md)
      cvn <- .remove_pattern_from_names(colnames(mf), ignore_lag = TRUE)
      cvn[1] <- rn[1]
    }
  }


  # fix duplicated colnames ---------------------------------------------------

  # do we have duplicated names?
  dupes <- which(duplicated(cvn))
  if (!.is_empty_string(dupes)) cvn[dupes] <- sprintf("%s.%s", cvn[dupes], seq_along(dupes))

  colnames(mf) <- cvn


  # add weighting variable ----------------------------------------------------

  weighting_var <- find_weights(x)
  if (!is.null(weighting_var) && !all(weighting_var %in% colnames(mf))) {
    mf <- tryCatch(
      {
        tmp <- suppressWarnings(cbind(mf, get_weights(x)))
        colnames(tmp)[ncol(tmp)] <- weighting_var
        tmp
      },
      error = function(e) {
        mf
      }
    )
  }


  # add back possible trials-data ---------------------------------------------

  if (!is.null(trials.data)) {
    new.cols <- setdiff(colnames(trials.data), colnames(mf))
    if (!.is_empty_string(new.cols)) mf <- cbind(mf, trials.data[, new.cols, drop = FALSE])
  }

  # remove "trial response"
  # see https://github.com/easystats/modelbased/issues/164
  # if (rn == colnames(mf)[1] && is.matrix(mf[[1]])) {
  #   mf[[1]] <- NULL
  # }

  .add_remaining_missing_variables(
    x,
    mf,
    effects,
    component = "all",
    factors = factors,
    logicals = logicals,
    interactions = interactions,
    verbose = verbose
  )
}






# add remainng variables with special pattern -------------------------------

.add_remaining_missing_variables <- function(model,
                                             mf,
                                             effects,
                                             component,
                                             factors = NULL,
                                             logicals = NULL,
                                             interactions = NULL,
                                             verbose = TRUE) {
  # check if data argument was used
  model_call <- get_call(model)
  if (is.null(model_call)) {
    data_arg <- NULL
  } else {
    data_arg <- .safe(parse(text = safe_deparse(model_call))[[1]]$data)
  }

  # do we have variable names like "mtcars$mpg"?
  if (is.null(data_arg) && all(grepl("$", colnames(mf), fixed = TRUE))) {
    colnames(mf) <- gsub("(.*)\\$(.*)", "\\2", colnames(mf))
  }

  predictors <- find_predictors(
    model,
    effects = effects,
    component = component,
    flatten = TRUE,
    verbose = FALSE
  )

  # include subset variables
  subset_vars <- .safe(all.vars(model_call$subset))
  missing_vars <- unique(c(setdiff(predictors, colnames(mf)), subset_vars))

  # check if missing variables can be recovered from the environment,
  # and if so, add to model frame.
  if (!is.null(missing_vars) && length(missing_vars) > 0) {
    env_data <- .recover_data_from_environment(model)
    if (!is.null(env_data) && all(missing_vars %in% colnames(env_data))) {
      shared_columns <- intersect(colnames(env_data), c(missing_vars, colnames(mf)))
      env_data <- stats::na.omit(env_data[shared_columns])
      if (nrow(env_data) == nrow(mf) && !any(missing_vars %in% colnames(mf))) {
        mf <- cbind(mf, env_data[missing_vars])
      }
    }
  }

  # fix interaction terms
  if (!is.null(interactions)) {
    full_data <- .safe(.recover_data_from_environment(model))
    if (!is.null(full_data) && nrow(full_data) == nrow(mf)) {
      mf[c(interactions, names(interactions))] <- NULL
      mf <- cbind(mf, full_data[c(interactions, names(interactions))])
    } else {
      for (i in seq_along(interactions)) {
        int <- interactions[i]
        mf[[names(int)]] <- as.factor(substr(
          as.character(mf[[int]]),
          regexpr("\\.[^\\.]*$", as.character(mf[[int]])) + 1,
          nchar(as.character(mf[[int]]))
        ))
        mf[[int]] <- as.factor(substr(
          as.character(mf[[int]]),
          0,
          regexpr("\\.[^\\.]*$", as.character(mf[[int]])) - 1
        ))
      }
      if (isTRUE(verbose)) {
        format_warning(
          "The data contains variables used 'interaction()'-functions. These are probably not recovered accurately in the returned data frame.",
          "Please check the data frame carefully."
        )
      }
    }
  }

  # add attributes for those that were factors
  if (length(factors)) {
    factors <- gsub("^(as\\.factor|as_factor|factor|as\\.ordered|ordered)\\((.*)\\)", "\\2", factors)
    for (i in factors) {
      if (.is_numeric_character(mf[[i]])) {
        mf[[i]] <- .to_numeric(mf[[i]])
      }
      if (is.numeric(mf[[i]])) {
        attr(mf[[i]], "factor") <- TRUE
      }
    }
    attr(mf, "factors") <- factors
  }

  # add attributes for those that were logicals
  if (length(logicals)) {
    logicals <- gsub("^(as\\.logical|as_logical|logical)\\((.*)\\)", "\\2", logicals)
    for (i in logicals) {
      mf[[i]] <- as.numeric(mf[[i]])
      attr(mf[[i]], "logical") <- TRUE
    }
    attr(mf, "logicals") <- logicals
  }

  # add attribute that subset is used
  attr(mf, "is_subset") <- !is.null(subset_vars) && length(subset_vars)

  mf
}






# combine data from different model components -------------------------------

# This helper functions ensures that data from different model components
# are included in the returned data frame
#
.return_combined_data <- function(x, mf, effects, component, model.terms, is_mv = FALSE, verbose = TRUE) {
  response <- unlist(model.terms$response, use.names = FALSE)

  # save factors attribute
  factors <- attr(mf, "factors", exact = TRUE)

  if (is_mv) {
    fixed.component.data <- switch(component,
      all = c(
        sapply(model.terms[-1], function(i) i$conditional),
        sapply(model.terms[-1], function(i) i$zero_inflated),
        sapply(model.terms[-1], function(i) i$dispersion)
      ),
      conditional = sapply(model.terms[-1], function(i) i$conditional),
      zi = ,
      zero_inflated = sapply(model.terms[-1], function(i) i$zero_inflated),
      dispersion = sapply(model.terms[-1], function(i) i$dispersion)
    )

    random.component.data <- switch(component,
      all = c(
        sapply(model.terms[-1], function(i) i$random),
        sapply(model.terms[-1], function(i) i$zero_inflated_random)
      ),
      conditional = sapply(model.terms[-1], function(i) i$random),
      zi = ,
      zero_inflated = sapply(model.terms[-1], function(i) i$zero_inflated_random)
    )

    fixed.component.data <- unlist(fixed.component.data, use.names = FALSE)
    random.component.data <- unlist(random.component.data, use.names = FALSE)
  } else {
    all_elements <- intersect(names(model.terms), .get_elements("fixed", "all"))
    fixed.component.data <- switch(component,
      all = unlist(model.terms[all_elements], use.names = FALSE),
      conditional = model.terms$conditional,
      zi = ,
      zero_inflated = model.terms$zero_inflated,
      dispersion = model.terms$dispersion
    )

    random.component.data <- switch(component,
      all = c(model.terms$random, model.terms$zero_inflated_random),
      conditional = model.terms$random,
      zi = ,
      zero_inflated = model.terms$zero_inflated_random
    )
  }


  # this is to remove the "1" from intercept-ony-models

  if (!is_empty_object(fixed.component.data)) {
    fixed.component.data <- .remove_values(fixed.component.data, c("1", "0"))
    fixed.component.data <- .remove_values(fixed.component.data, c(1, 0))
  }
  if (!is_empty_object(random.component.data)) {
    random.component.data <- .remove_values(random.component.data, c("1", "0"))
    random.component.data <- .remove_values(random.component.data, c(1, 0))
  }

  model_weights <- find_weights(x)
  # if (!is.null(model_weights) && "(weights)" %in% colnames(mf)) {
  #   model_weights <- c(model_weights, "(weights)")
  # }

  vars <- switch(effects,
    all = unique(c(response, fixed.component.data, random.component.data, model_weights)),
    fixed = unique(c(response, fixed.component.data, model_weights)),
    random = unique(random.component.data)
  )

  # add offset
  vars <- c(vars, find_offset(x))

  still_missing <- setdiff(vars, colnames(mf))
  vars <- intersect(vars, colnames(mf))
  dat <- mf[, vars, drop = FALSE]

  if (is_empty_object(dat)) {
    if (isTRUE(verbose)) {
      format_warning(
        sprintf("Data frame is empty, probably component `%s` does not exist in the %s-part of the model?", component, effects)
      )
    }
    return(NULL)
  }

  if (length(still_missing) && isTRUE(verbose)) {
    format_warning(
      sprintf("Following potential variables could not be found in the data: %s", toString(still_missing))
    )
  }

  if ("(offset)" %in% colnames(mf) && !("(offset)" %in% colnames(dat))) {
    dat <- cbind(dat, mf[["(offset"]])
  }

  attr(dat, "factors") <- factors
  dat
}





# find zi-data -----------------------------------

# this function tries to get the data from variables from the zero-inflated
# component and adds them to the model frame. Useful if the zi-component
# has other variables than the count component.
#
.add_zeroinf_data <- function(x, mf, tn) {
  tryCatch(
    {
      env_data <- .recover_data_from_environment(x)[, tn, drop = FALSE]
      .merge_dataframes(env_data, mf, replace = TRUE)
    },
    error = function(x) {
      mf
    }
  )
}




# special model handling -----------------------------------

.get_zelig_relogit_frame <- function(x) {
  vars <- find_variables(x, flatten = TRUE, verbose = FALSE)
  x$data[, vars, drop = FALSE]
}




# combine data from count and zi-component -----------------------------------

.return_zeroinf_data <- function(x, component, verbose = TRUE) {
  model.terms <- find_variables(x, effects = "all", component = "all", flatten = FALSE, verbose = FALSE)
  model.terms$offset <- find_offset(x)

  mf <- .safe(stats::model.frame(x))
  mf <- .prepare_get_data(x, mf, verbose = verbose)
  # add variables from other model components
  mf <- .add_zeroinf_data(x, mf, model.terms$zero_inflated)

  fixed.data <- switch(component,
    all = c(model.terms$conditional, model.terms$zero_inflated, model.terms$offset),
    conditional = c(model.terms$conditional, model.terms$offset),
    zi = ,
    zero_inflated = model.terms$zero_inflated
  )

  mf[, unique(c(model.terms$response, fixed.data, find_weights(x))), drop = FALSE]
}




# "clean" model frame and get data -----------------------------------

# here we have a model frame with many variables, so just extract the important ones...
#
.get_data_from_modelframe <- function(x, dat, effects, verbose = TRUE) {
  if (is_empty_object(dat)) {
    format_warning("Could not get model data.")
    return(NULL)
  }
  cn <- clean_names(colnames(dat))

  ft <- switch(effects,
    fixed = find_variables(x, effects = "fixed", flatten = TRUE),
    all = find_variables(x, flatten = TRUE),
    random = find_random(x, split_nested = TRUE, flatten = TRUE)
  )
  remain <- intersect(c(ft, find_weights(x), find_offset(x)), cn)

  mf <- .safe(dat[, remain, drop = FALSE], dat)
  .prepare_get_data(x, mf, effects, verbose = verbose)
}




# find start vector of nlmer-models from the environment -----------------------------------

# return data from a data frame that is in the environment,
# and subset the data, if necessary
.get_startvector_from_env <- function(x) {
  .safe({
    sv <- eval(parse(text = safe_deparse(x@call))[[1]]$start)
    if (is.list(sv)) sv <- sv[["nlpars"]]
    names(sv)
  })
}





# backtransform variables -------------------------------

.backtransform <- function(mf, x) {
  tryCatch(
    {
      patterns <- c(
        "scale\\(log", "exp\\(scale", "log\\(log", "log", "log1p",
        "log10", "log2", "sqrt", "exp", "expm1", "scale", "cos", "sin",
        "tan", "acos", "asin", "atan"
      )

      # to save time, create a full regex and see if we have any
      # transformation at all, instead of always checking for each pattern
      full_pattern <- sprintf("%s\\(([^,)]*).*", paste0("(", paste0(patterns, collapse = "|"), ")"))

      # only backtransform if we have any matches
      if (any(grepl(full_pattern, colnames(mf)))) {
        for (i in patterns) {
          mf <- .backtransform_helper(mf, i, x)
        }
      }
      mf
    },
    error = function(e) {
      mf
    }
  )
}


.backtransform_helper <- function(mf, type, model) {
  cn <- .get_transformed_names(colnames(mf), type)
  if (!.is_empty_string(cn)) {
    for (i in cn) {
      if (type == "scale\\(log") { # nolint
        mf[[i]] <- exp(.unscale(mf[[i]]))
      } else if (type == "exp\\(scale") {
        mf[[i]] <- .unscale(log(mf[[i]]))
      } else if (type == "log\\(log") {
        mf[[i]] <- exp(exp(mf[[i]]))
      } else if (type == "log") {
        # exceptions: log(x+1) or log(1+x)
        plus_minus <- NULL
        # no plus-minus?
        if (grepl("log\\((.*)\\+(.*)\\)", i)) {
          # 1. try: log(x + number)
          plus_minus <- .safe(eval(parse(text = gsub("log\\(([^,\\+)]+)(.*)\\)", "\\2", i))))
          # 2. try: log(number + x)
          if (is.null(plus_minus)) {
            plus_minus <- .safe(eval(parse(text = gsub("log\\(([^,\\+)]+)(.*)\\)", "\\1", i))))
          }
        }
        if (is.null(plus_minus)) {
          mf[[i]] <- exp(mf[[i]])
        } else {
          mf[[i]] <- exp(mf[[i]]) - plus_minus
        }
      } else if (type == "log1p") {
        mf[[i]] <- expm1(mf[[i]])
      } else if (type == "log10") {
        mf[[i]] <- 10^(mf[[i]])
      } else if (type == "log2") {
        mf[[i]] <- 2^(mf[[i]])
      } else if (type == "sqrt") {
        mf[[i]] <- (mf[[i]])^2
      } else if (type == "exp") {
        mf[[i]] <- log(mf[[i]])
      } else if (type == "expm1") {
        mf[[i]] <- log1p(mf[[i]])
      } else if (type == "scale") {
        mf[[i]] <- .unscale(mf[[i]])
      } else if (type %in% c("cos", "sin", "tan", "acos", "asin", "atan")) {
        mf[[i]] <- .recover_data_from_environment(model)[[i]]
      }
      colnames(mf)[colnames(mf) == i] <- .get_transformed_terms(i, type)
    }
  }
  mf
}



.unscale <- function(x) {
  m <- attr(x, "scaled:center")
  s <- attr(x, "scaled:scale")

  if (is.null(m)) m <- 0
  if (is.null(s)) s <- 1

  s * x + m
}



# find transformed terms, to convert back as raw data --------------------------------

# Find transformed terms inside model formula, and return "clean" term names
.get_transformed_terms <- function(model, type = "all") {
  if (is.character(model)) {
    x <- model
  } else {
    x <- find_terms(model, flatten = TRUE)
  }
  pattern <- sprintf("%s\\(([^,\\+)]*).*", type)
  out <- trim_ws(gsub(pattern, "\\1", grep(pattern, x, value = TRUE)))
  # validation check - when we have something like "log(1+x)" instead "log(x+1)",
  # the regex pattern returns "1" instead of "x3"
  if (!is.na(suppressWarnings(as.numeric(out)))) {
    out <- trim_ws(gsub(pattern, "\\2", grep(pattern, x, value = TRUE)))
  }
  out
}


# get column names of transformed terms
.get_transformed_names <- function(x, type = "all") {
  pattern <- sprintf("%s\\(([^,)]*).*", type)
  grep(pattern, x, value = TRUE)
}


.retrieve_htest_data <- function(x) {
  out <- tryCatch(
    {
      # special handling of survey-objects
      if (startsWith(x$data.name, "svy")) {
        if (grepl("pearson's x^2", tolower(x$method), fixed = TRUE)) {
          d <- x$observed
        } else {
          d <- NULL
        }
      } else {
        # split by "and" and "by". E.g., for t.test(1:3, c(1,1:3)), we have
        # x$data.name = "1:3 and c(1, 1:3)"
        data_name <- trim_ws(unlist(strsplit(x$data.name, "(and|by)"), use.names = FALSE))

        # now we may have exceptions, e.g. for friedman.test(wb$x, wb$w, wb$t)
        # x$data.name is "wb$x, wb$w and wb$t" and we now have "wb$x, wb$w" and
        # "wb$t", so we need to split at comma as well. However, the above t-test
        # example returns "1:3" and "c(1, 1:3)", so we only must split at comma
        # when it is not inside parentheses.
        data_comma <- unlist(strsplit(data_name, "(\\([^)]*\\))"), use.names = FALSE)

        # any comma not inside parentheses?
        if (any(grepl(",", data_comma, fixed = TRUE))) {
          data_name <- trim_ws(unlist(strsplit(data_comma, ", ", fixed = TRUE)))
        }

        # exeception: list for kruskal-wallis
        if (grepl("Kruskal-Wallis", x$method, fixed = TRUE) &&
          (length(data_name) == 1 && startsWith(data_name, "list("))) {
          l <- eval(str2lang(x$data.name))
          names(l) <- paste0("x", seq_along(l))
          return(l)
        }

        data_call <- lapply(data_name, str2lang)
        columns <- lapply(data_call, eval)

        # detect which kind of tests we have -----------------

        # McNemar ============================================================

        if (!grepl(" (and|by) ", x$data.name) && !grepl(x$method, "Paired t-test", fixed = TRUE) &&
          !startsWith(x$method, "Wilcoxon") &&
          (startsWith(x$method, "McNemar") || (length(columns) == 1 && is.matrix(columns[[1]])))) {
          # McNemar: preserve table data for McNemar ----
          return(as.table(columns[[1]]))
        } else if (startsWith(x$method, "Kruskal-Wallis") && length(columns) == 1 && is.list(columns[[1]])) {
          # Kruskal-Wallis: check if data is a list for kruskal-wallis ----
          l <- columns[[1]]
          names(l) <- paste0("x", seq_along(l))
          return(l)
        } else if (grepl("t-test", x$method, fixed = TRUE)) {
          # t-Test: (Welch) Two Sample t-test ----
          if (grepl("Two", x$method, fixed = TRUE)) {
            if (grepl(" and ", x$data.name, fixed = TRUE)) {
              return(.htest_reshape_long(columns))
            } else if (grepl(" by ", x$data.name, fixed = TRUE)) {
              return(.htest_no_reshape(columns))
            }

            # t-Test: Paired t-test
          } else if (startsWith(x$method, "Paired")) {
            if (grepl(" and ", x$data.name, fixed = TRUE)) {
              # t-Test: Paired t-test, two vectors ----
              return(.htest_reshape_long(columns))
            } else if (grepl(" by ", x$data.name, fixed = TRUE)) {
              # t-Test: Paired t-test, formula (no reshape required) ----
              return(.htest_no_reshape(columns))
            } else if (startsWith(x$data.name, "Pair(")) {
              # t-Test: Paired t-test ----
              return(.htest_reshape_matrix(columns))
            }

            # t-Test: One Sample
          } else {
            d <- .htest_other_format(columns)
          }
        } else if (startsWith(x$method, "Wilcoxon rank sum")) {
          # Wilcoxon ========================================================
          if (grepl(" by ", x$data.name, fixed = TRUE)) {
            # Wilcoxon: Paired Wilcoxon, formula (no reshape required) ----
            return(.htest_no_reshape(columns))
          } else {
            return(.htest_reshape_long(columns))
          }
        } else if (startsWith(x$method, "Wilcoxon signed rank")) {
          if (startsWith(x$data.name, "Pair(")) {
            return(.htest_reshape_matrix(columns))
          } else if (grepl(" and ", x$data.name, fixed = TRUE)) {
            # Wilcoxon: Paired Wilcoxon, two vectors ----
            return(.htest_reshape_long(columns))
          } else if (grepl(" by ", x$data.name, fixed = TRUE)) {
            # Wilcoxon: Paired Wilcoxon, formula (no reshape required) ----
            return(.htest_no_reshape(columns))
          } else {
            # Wilcoxon: One sample ----
            d <- .htest_other_format(columns)
          }
        } else {
          # Other htests ======================================================
          d <- .htest_other_format(columns)
        }

        # fix column names
        if (all(grepl("(.*)\\$(.*)", data_name)) && length(data_name) == length(colnames(d))) {
          colnames(d) <- gsub("(.*)\\$(.*)", "\\2", data_name)
        } else if (ncol(d) > 2L) {
          colnames(d) <- paste0("x", seq_len(ncol(d)))
        } else if (ncol(d) == 2) {
          colnames(d) <- c("x", "y")
        } else {
          colnames(d) <- "x"
        }
      }

      d
    },
    error = function(e) {
      NULL
    }
  )

  # 2nd try
  if (is.null(out)) {
    for (parent_level in 1:5) {
      out <- .safe({
        data_name <- trim_ws(unlist(strsplit(x$data.name, "(and|,|by)"), use.names = FALSE))
        as.table(get(data_name, envir = parent.frame(n = parent_level)))
      })
      if (!is.null(out)) break
    }
  }

  out
}


# reshape helpers -------------------

.htest_reshape_long <- function(columns) {
  data.frame(
    x = unlist(columns),
    y = c(
      rep("1", length(columns[[1]])),
      rep("2", length(columns[[2]]))
    ),
    stringsAsFactors = TRUE
  )
}

.htest_no_reshape <- function(columns) {
  data.frame(
    x = columns[[1]],
    y = as.factor(columns[[2]])
  )
}


.htest_reshape_matrix <- function(columns) {
  data.frame(
    x = c(columns[[1]][, 1, drop = TRUE], columns[[1]][, 2, drop = TRUE]),
    y = c(
      rep("1", nrow(columns[[1]])),
      rep("2", nrow(columns[[1]]))
    ),
    stringsAsFactors = TRUE
  )
}


.htest_other_format <- function(columns) {
  max_len <- max(lengths(columns))
  for (i in seq_along(columns)) {
    if (length(columns[[i]]) < max_len) {
      columns[[i]] <- c(columns[[i]], rep(NA, max_len - length(columns[[i]])))
    }
  }
  as.data.frame(columns)
}
Any scripts or data that you put into this service are public.
insight documentation built on June 22, 2024, 9:14 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
insight
Easy Access to Model Information for Various Model Objects

R/utils_get_data.R
In insight: Easy Access to Model Information for Various Model Objects

Defines functions .htest_other_format .htest_reshape_matrix .htest_no_reshape .htest_reshape_long .prepare_get_data

Try the insight package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

insight Easy Access to Model Information for Various Model Objects

R/utils_get_data.R In insight: Easy Access to Model Information for Various Model Objects

Defines functions .htest_other_format .htest_reshape_matrix .htest_no_reshape .htest_reshape_long .prepare_get_data

Try the insight package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

insight
Easy Access to Model Information for Various Model Objects

R/utils_get_data.R
In insight: Easy Access to Model Information for Various Model Objects