R/relativeimportance.R

Defines functions removeDataThatAreOutliersFromLinearRegression extractFirstRowMatrixToNumeric computeCorrelationImportance jaccardTest computeJaccardImportance singleJaccardExpectation singleJaccardCoefficient computeJaccardCoefficients subsetDataWeightsAndFormula extractRegressionInfo appendStatistics isTStatisticUsed extractVariableStandardErrors extractVariableCoefficients weightedZScores signsWarning extractNumericX estimateRelativeImportance estimateImportance

estimateImportance <- function(formula, data = NULL, weights, type, signs, r.square, variable.names,
                               robust.se = FALSE, outlier.prop.to.remove, show.warnings = TRUE, correction,
                               importance, ...)
{
    if (!is.null(weights))
        robust.se <- FALSE
    if (importance == "Relative Importance Analysis")
        estimateRelativeImportance(formula, data, weights, type, signs, r.square, variable.names,
                                   robust.se, outlier.prop.to.remove, show.warnings, correction, ...)
    else if (importance == "Shapley Regression")
        computeShapleyImportance(formula, data, weights, signs, variable.names, robust.se, outlier.prop.to.remove,
                                 show.warnings, correction, ...)
    else if (importance == "Jaccard Coefficient")
        computeJaccardImportance(formula, data, weights, variable.names, correction, outlier.prop.to.remove)
    else if (importance == "Correlation")
        computeCorrelationImportance(formula, data, weights, variable.names, correction, outlier.prop.to.remove)
    else
        stop("Importance type not handled: ", importance)
}

#' @importFrom stats cov.wt as.formula
#' @importFrom flipData DataFormula
#' @importFrom flipTransformations AsNumeric
#' @importFrom flipU OutcomeName AllVariablesNames
#' @importFrom verbs Sum SumEachRow
#' @noRd
estimateRelativeImportance <- function(formula, data = NULL, weights, type, signs, r.square, variable.names,
                                       robust.se = FALSE, outlier.prop.to.remove = NULL, show.warnings = TRUE,
                                       correction, ...)
{
    # Johnson, J.W. (2000). "A Heuristic Method for Estimating the Relative Weight
    # of Predictor Variables in Multiple Regression"

    if (type == "Multinomial Logit")
        stop("Relative importance analysis is not available for ", type)

    processed.data <- subsetDataWeightsAndFormula(formula, data, weights)
    data <- processed.data$data
    weights <- processed.data$weights
    formula <- processed.data$formula

    info <- extractRegressionInfo(formula, data, weights, type, signs,
                                  r.square, variable.names, robust.se,
                                  outlier.prop.to.remove, ...)
    signs <- info$signs
    r.square <- info$r.square
    variable.names <- info$variable.names

    signsWarning(signs, show.warnings, type)

    num.X <- extractNumericX(formula, data, show.warnings)

    input.weights <- weights
    if (is.null(weights))
        weights <- rep(1, nrow(data))

    result <- if (is.na(info[["non.outlier.n"]])) list()
              else list(non.outlier.n = info[["non.outlier.n"]])

    x.zscore <- sapply(num.X, function(x) weightedZScores(x, weights))

    outcome.name <- OutcomeName(formula, data)
    y <- if (type == "Linear")
    {
        num.y <- AsNumeric(data[[outcome.name]], binary = FALSE)
        weightedZScores(num.y, weights)
    }
    else
        data[[outcome.name]]
    # Protect against ordered factor with empty levels
    if (type == "Ordered Logit")
        y <- Ordered(y)

    corr.x <- cov.wt(num.X, wt = weights, cor = TRUE)$cor
    diag(corr.x) <- 1    # may not be exactly 1 from cov.wt
    eigen.corr.x <- eigen(corr.x)
    delta <- diag(sqrt(eigen.corr.x$values))
    delta_inverse <- diag(1 / sqrt(eigen.corr.x$values))
    lambda <- eigen.corr.x$vectors %*% delta %*% t(eigen.corr.x$vectors) # Lambda = V * Delta * V^T
    lambda_inverse <- eigen.corr.x$vectors %*% delta_inverse %*% t(eigen.corr.x$vectors) # Lambda^-1 = V * Delta^-1 * V^T

    z <- x.zscore %*% lambda_inverse # orthogonal regressors

    reg.data <- cbind(data.frame(y = y), as.data.frame(z))
    data.formula <- as.formula(paste0("y ~ ", paste(paste0("V", 1:ncol(z)), collapse = "+")))

    fit <- if (type == "Linear")
        lm(y ~ 0 + z, weights = weights)
    else
        FitRegression(data.formula, reg.data, input.weights,
                      type, FALSE, outlier.prop.to.remove = 0, seed = 12321, ...)$original
    beta <- extractVariableCoefficients(fit, type, FALSE)
    beta.se <- extractVariableStandardErrors(fit, type, robust.se, FALSE)

    raw.importance <- as.vector(lambda ^ 2 %*% beta ^ 2)

    names(raw.importance) <- variable.names
    if (r.square == 0)
    {
        scaling.factor <- 1
        warning("The R-squared is 0. As a result, the raw scores have not ",
                "been scaled to sum to the R-squared.")
    }
    else
        # R-squared is not always the sum of raw.importance, e.g. with binary logit
        scaling.factor <- r.square / Sum(raw.importance, remove.missing = FALSE)
    raw.importance <- raw.importance * scaling.factor

    se  <- c(sqrt(lambda ^ 4 %*% (beta.se ^ 4 * (2 + 4 * (beta / beta.se) ^ 2)))) * scaling.factor
    names(se) <- variable.names

    appendStatistics(result, raw.importance, se, signs, fit, correction)
}

extractNumericX <- function(formula, data, show.warnings)
{
    formula.names <- AllVariablesNames(formula, data)
    outcome.name <- OutcomeName(formula, data)
    X <- data[setdiff(formula.names, outcome.name)]

    ## We remove the "ordered" class so that ordered-categorical variables are
    ## treated in the same way as they are in regression, i.e., dummy variables
    ## are created from the categories.
    for (j in 1:ncol(X))
        if (all(c("factor", "ordered") %in% class(X[, j])))
            class(X[, j]) <- "factor"

    if (show.warnings && any(factors <- sapply(X, function(x) class(x) == "factor")))
        warning(paste0("The following variables have been treated as categorical: ",
                       paste0(names(X)[factors], collapse = ","),
                       ". This may over-inflate their effects."))

    AsNumeric(X, remove.first = TRUE)
}

signsWarning <- function(signs, show.warnings, type)
{
    if (show.warnings && any(signs < 0))
        warning("Negative signs in Relative Importance scores were applied from coefficient signs in ",
                regressionType(type),
                ". To disable this feature, check the Absolute importance scores option.")
}

#' @importFrom stats weighted.mean
#' @importFrom flipStatistics Correlation
weightedZScores <- function(x, weights)
{
    m <- weighted.mean(x, weights)
    std <- sqrt(Correlation(x, x, weights = weights, correlation = FALSE))
    (x - m) / std
}

# Only extract coefficients for variables, not intercepts
extractVariableCoefficients <- function(model, type, linear.regression.intercept = TRUE)
{
    if (type == "Linear" && !linear.regression.intercept)
        model$coefficients
    else if (type %in% c("Linear", "Binary Logit", "Poisson", "Quasi-Poisson", "NBD"))
        model$coefficients[-1]
    else if (type %in% c("Ordered Logit"))
        model$coefficients
    else
        stop(paste("Type not handled: ", type))
}

extractVariableStandardErrors <- function(model, type, robust.se, linear.regression.intercept = TRUE)
{
    standard.errors <- if (robust.se != FALSE)
        coeftest(model, vcov. = vcov2(model, robust.se))[, 2]
    else
        summary(model)$coefficients[, 2]

    if (type == "Linear" && !linear.regression.intercept)
        standard.errors
    else if (type %in% c("Linear", "Binary Logit", "Poisson", "Quasi-Poisson", "NBD"))
        standard.errors[-1]
    else if (type %in% c("Ordered Logit"))
        standard.errors[-length(standard.errors):-(length(model$coefficients) + 1)]
    else
        stop(paste("Type not handled: ", type))
}

isTStatisticUsed <- function(model)
{
    grepl("^t", colnames(summary(model)$coefficients)[3])
}

appendStatistics <- function(obj, raw.importance, standard.errors, signs, fit, correction)
{
    result <- obj
    result$raw.importance <- raw.importance
    result$standard.errors <- standard.errors
    result$importance <- unname(signs) * 100 * prop.table(raw.importance)
    result$statistics <- unname(signs) * raw.importance / standard.errors
    is.t.statistic.used <- isTStatisticUsed(fit)
    result$statistic.name <- if (is.t.statistic.used) "t" else "z"
    raw.p.values <- if (is.t.statistic.used)
        2 * pt(abs(result$statistics), fit$df.residual, lower.tail = FALSE)
    else
        2 * pnorm(abs(result$statistics), lower.tail = FALSE)
    result$p.values <- pvalAdjust(raw.p.values, correction)
    result
}

extractRegressionInfo <- function(formula, data, weights, type, signs,
                                  r.square, variable.names, robust.se = FALSE,
                                  outlier.prop.to.remove, ...)
{
    non.outlier.n <- NA
    if (is.null(signs) || any(is.na(signs)) || is.null(r.square) || is.na(r.square))
    {
        formula2 <- DataFormula(formula, data)
        fit <- FitRegression(formula2, data, weights, type, robust.se, outlier.prop.to.remove, ...)
        if (is.null(signs) || any(is.na(signs)))
        {
            signs <- sign(round(extractVariableCoefficients(fit$original, type), 13))
            signs[signs == 0] <- 1 # set sign 0 to 1
        }
        if (is.null(r.square) || is.na(r.square))
            r.square <- GoodnessOfFit(fit$original)$value

        if (all(is.na(variable.names)))
        {
            tmp.names <- CleanBackticks(names(fit$original$coefficients))
            variable.names <- if (type == "Ordered Logit") tmp.names else tmp.names[-1]
        }
        if (!is.null(outlier.prop.to.remove) && outlier.prop.to.remove > 0)
            non.outlier.n <- sum(fit[["non.outlier.data"]])
    }
    list(signs = signs, r.square = r.square,
         variable.names = variable.names, non.outlier.n = non.outlier.n)
}

#' Function to find the correct subset omitting outliers, returns
#' the appropriate subsetted data, updated formula (incase dot is used)
#' @param formula The formula object to construct the regression
#' @param data The estimation data used in the regression construction
#' @param weights The optional numeric vector of non-negative weights
#' @param outlier.prop.to.remove Single numeric value denoting the proportion of observations removed in
#'  the automated outlier removal.
#' @importFrom stats terms.formula update.formula
#' @importFrom flipU OutcomeName
#' @importFrom flipU CopyAttributes
#' @noRd
subsetDataWeightsAndFormula <- function(formula, data, weights)
{
    # If necessary, filter the data to the outlier adjusted subset
    if ("non.outlier.data_GQ9KqD7YOf" %in% names(data))
    {
        data.indices <- data[["non.outlier.data_GQ9KqD7YOf"]]
        data <- CopyAttributes(data[data.indices, -which(names(data) == "non.outlier.data_GQ9KqD7YOf")], data)
        if (!is.null(weights))
            weights <- weights[data.indices]
    }

    # Protect against dot in formula with the non.outlier indicator variable
    formula.terms <- terms.formula(formula, data = data)
    predictor.names <- attr(formula.terms, "term.labels")
    non.outlier.in.data.frame <- predictor.names == "non.outlier.data_GQ9KqD7YOf"
    if (any(non.outlier.in.data.frame))
    {
        formula <- update.formula(formula, drop.terms(formula.terms,
                                                      which(non.outlier.in.data.frame),
                                                      keep.response = TRUE))
        predictor.names <- predictor.names[!non.outlier.in.data.frame]
    }
    # subset data (removes interaction term from data and only extracts terms from formula)
    outcome.name <- OutcomeName(formula, data)
    data <- data[, c(outcome.name, predictor.names)]
    return(list(formula = formula,
                data = data,
                weights = weights))
}

#' Computes the Jaccard Coefficients for the outcome variable against all the predictors
#' used in the regression and returns those coefficients along with their relative importance
#' and the outcome variable itself along with the data.frame of predictors
#' @param formula The formula used in the Regression model
#' @param data The data to compute the calculation on
#' @param weights A numeric vector of weights
#' @param variable.names Vector of names of the coefficients in the regression model. Defaults to NA for
#'   comptability with crosstab interaction code that can predictor data could be excluded when filtered.
#' @importFrom flipU OutcomeName
#' @importFrom stats terms.formula
#' @noRd
computeJaccardCoefficients <- function(formula, data = NULL, weights, variable.names = NA,
                                       outlier.prop.to.remove = 0)
{
    processed.data <- subsetDataWeightsAndFormula(formula, data, weights)
    relevant.data <- processed.data$data
    weights <- processed.data$weights
    formula <- processed.data$formula

    if (!"non.outlier.data_GQ9KqD7YOf" %in% names(data) &&
        !is.null(outlier.prop.to.remove) && outlier.prop.to.remove > 0)
        relevant.data <- removeDataThatAreOutliersFromLinearRegression(formula, relevant.data, weights,
                                                                       outlier.prop.to.remove = outlier.prop.to.remove)
    outcome.name <- OutcomeName(formula, relevant.data)
    outcome.variable <- relevant.data[[outcome.name]]
    predictor.names <- attr(terms.formula(formula, data = relevant.data), "term.labels")
    predictor.variables <- relevant.data[, predictor.names, drop = FALSE]
    if (!any(is.na(variable.names)))
        names(predictor.variables) <- variable.names
    else
        variable.names <- names(predictor.variables)
    jaccard.coefficients <- vapply(predictor.variables, singleJaccardCoefficient,
                                   numeric(1), y = outcome.variable, weights = weights)
    names(jaccard.coefficients) <- variable.names
    list(outcome.variable = outcome.variable,
         coefficients = jaccard.coefficients,
         predictor.variables = predictor.variables,
         weights = weights,
         non.outlier.n = nrow(relevant.data))
}

#' Compute the single Jaccard coefficient between x and y
#' @param x The binary input variable x
#' @param y The binary input variable y
#' @param centered A logical to determine if the centered coefficient is returned.
#'   If \code{TRUE}, then the estimate of the mean is subtracted off the jaccard coefficient.
#'   @importFrom verbs Sum
#' @noRd
singleJaccardCoefficient <- function(x, y, centered = FALSE, weights = NULL) {
    if (is.null(weights))
        j <- Sum(y & x)/Sum(x | y)
    else
        j <- Sum(weights[y & x])/Sum(weights[x | y])
    if (centered)
        j <- j - singleJaccardExpectation(x, y)
    j
}

# Computes the expected value of the Jaccard coefficient between binary variables x and y
singleJaccardExpectation <- function(x, y)
{
    px <- mean(x, na.rm = TRUE)
    py <- mean(y, na.rm = TRUE)
    px * py/(px + py - px * py)
}

#' @param formula The formula used in the Regression model
#' @param data The data to compute the calculation on
#' @param weights A numeric vector of weights
#' @param variable.names Vector of names of the coefficients in the regression model. Defaults to NA for
#'   comptability with crosstab interaction code that can predictor data could be excluded when filtered.
#' @param correction A character specifying the multiple comparisons correction to be applied.
#' @importFrom stats terms.formula
#' @noRd
computeJaccardImportance <- function(formula, data = NULL, weights, variable.names = NA,
                                     correction = "None", outlier.prop.to.remove = 0)
{
    jaccard.coef.output <- computeJaccardCoefficients(formula, data, weights,
                                                      variable.names, outlier.prop.to.remove)
    # Extract the outcome binary variable, the data.frame of predictor binary variables
    # the computed jaccard coefficients and the relative importance and possible weights
    y <- jaccard.coef.output$outcome.variable
    X <- jaccard.coef.output$predictor.variables
    jaccard.coefs <- jaccard.coef.output$coefficients
    weights <- if(is.null(jaccard.coef.output$weights)) rep(1, length(y)) else CalibrateWeight(weights)
    test.output <- lapply(X, jaccardTest, y = y, weights = weights)
    test.statistics <- vapply(test.output, "[[", numeric(1), "t")
    relative.importance <- 100 * prop.table(abs(test.statistics))
    pvalues <- vapply(test.output, "[[", numeric(1), "p.value")
    pvalues <- pvalAdjust(pvalues, correction)
    standard.errors <- vapply(test.output, "[[", numeric(1), "standard.error")
    names(pvalues) <- names(standard.errors) <- variable.names
    sample.size <- vapply(X, function(x) sum(!is.na(x) & !is.na(y)), numeric(1))
    names(sample.size) <- variable.names
    list(importance = relative.importance,
         raw.importance = jaccard.coefs,
         standard.errors = standard.errors,
         statistics = test.statistics,
         sample.size = sample.size,
         p.values = pvalues,
         non.outlier.n = jaccard.coef.output[["non.outlier.n"]])
}

#' Compute the single p-value using reasmpled binary variables x and y
#' @param x A numeric vector of the binary predictor variable
#' @param y A numeric vector of the binary outcome variable
#' @param weights A numeric vector of non-negative weight values
#' @importFrom survey svydesign svyratio degf
#' @importFrom verbs Sum
#' @noRd
jaccardTest <- function(x, y, weights)
{
    design <- svydesign(ids = ~ 1,
                        data = data.frame(numerator = as.integer(y & x),
                                          denominator = as.integer(y | x)),
                        weights = ~ weights)
    ratio.estimation <- svyratio(~ numerator, ~ denominator, design, na.rm = TRUE)
    jaccard <- as.numeric(ratio.estimation$ratio)
    df <- degf(design)
    p <- mean(x, na.rm = TRUE)
    q <- mean(y, na.rm = TRUE)
    expected.jaccard <- p * q / (p + q - p * q)
    # Catch case when estimated Jaccard coef == 1 (vectors identical)
    if (jaccard != 0 && jaccard != 1)
        standard.error <- sqrt(as.numeric(ratio.estimation$var))
    else
    { # Compute the second order Taylor expansion estimate of the variance
        weight.factor <- Sum(weights^2, remove.missing = FALSE)/Sum(weights, remove.missing = FALSE)^2
        multiplier <- (p + q - 2 * p * q)/(p * q * (p + q - p * q))
        standard.error <- expected.jaccard* sqrt(weight.factor * multiplier)
    }
    t <- (jaccard - expected.jaccard) / standard.error
    p <- 2 * pt(-abs(t), df)
    list(jaccard = jaccard,
         expected = expected.jaccard,
         df = df,
         standard.error = standard.error,
         t = t,
         p.value = p)
}

#' @param x The formula used in the Regression model
#' @param data The data to compute the calculation on
#' @param weights A numeric vector of weights
#' @param variable.names Vector of names of the coefficients in the regression model.
#' @param correction A character specifying the multiple comparisons correction to be applied.
#' @importFrom flipU OutcomeName
#' @importFrom stats terms.formula
#' @importFrom flipStatistics CorrelationsWithSignificance
#' @importFrom flipTransformations AsDataFrame
#' @noRd
computeCorrelationImportance <- function(formula, data = NULL, weights, variable.names,
                                         correction, outlier.prop.to.remove = 0)
{
    processed.data <- subsetDataWeightsAndFormula(formula, data, weights)
    relevant.data <- AsDataFrame(processed.data$data, use.names = TRUE, categorical.as.binary = FALSE)
    names(relevant.data) <- names(processed.data$data)
    weights <- processed.data$weights
    formula <- processed.data$formula

    if (!"non.outlier.data_GQ9KqD7YOf" %in% names(data) &&
        !is.null(outlier.prop.to.remove) && outlier.prop.to.remove > 0)
        relevant.data <- removeDataThatAreOutliersFromLinearRegression(formula, relevant.data, weights,
                                                                       outlier.prop.to.remove = outlier.prop.to.remove)

    outcome.name <- OutcomeName(formula, relevant.data)
    outcome.variable <- relevant.data[[outcome.name]]
    predictor.names <- attr(terms.formula(formula, data = relevant.data), "term.labels")
    predictor.variables <- relevant.data[, predictor.names, drop = FALSE]
    if (!any(is.na(variable.names)))
        colnames(relevant.data) <- c(outcome.name, variable.names)
    else
        variable.names <- colnames(predictor.variables)

    weights <- if (is.null(weights)) rep(1, nrow(relevant.data)) else weights
    correlation.output <- CorrelationsWithSignificance(relevant.data, weights)
    indices <- match(variable.names, colnames(correlation.output$cor), nomatch = 0)

    correlation.coefs <- extractFirstRowMatrixToNumeric(correlation.output$cor, indices)
    relative.importance <- 100 * prop.table(abs(correlation.coefs))
    statistics <- extractFirstRowMatrixToNumeric(correlation.output$t, indices)
    std.errs <- extractFirstRowMatrixToNumeric(correlation.output$standard.errors, indices)
    pvalues <- extractFirstRowMatrixToNumeric(correlation.output$p, indices)
    pvalues <- pvalAdjust(pvalues, correction)
    sample.size <- vapply(relevant.data[indices], function(x, y) sum(!is.na(x) & !is.na(y)), numeric(1), y = outcome.variable)
    list(importance = relative.importance,
         raw.importance = correlation.coefs,
         statistics = statistics,
         standard.errors = std.errs,
         sample.size = sample.size,
         p.values = pvalues,
         non.outlier.n = nrow(relevant.data))
}

# Extracts the first row of a numeric matrix, the columns selected are
# governed by the indices argument.
extractFirstRowMatrixToNumeric <- function(mat, indices)
{
    x <- mat[1, indices, drop = FALSE]
    names.x <- colnames(x)
    x <- as.numeric(x)
    names(x) <- names.x
    x
}

# Removes proportion of outliers from a fitted Linear Regression model and returns
# the data not considered to be outliers.
removeDataThatAreOutliersFromLinearRegression <- function(formula, data, weights, outlier.prop.to.remove)
{
    linear.model.outliers.removed <- FitRegression(.formula = formula, .estimation.data = data,
                                                   .weights = weights,  outlier.prop.to.remove = outlier.prop.to.remove,
                                                   robust.se = FALSE, # Not required as standard errors not used
                                                   type = "Linear")
    non.outlier.data <- linear.model.outliers.removed[["non.outlier.data"]]
    relevant.data <- linear.model.outliers.removed[[".estimation.data"]][non.outlier.data, ]
    relevant.data[["non.outlier.data_GQ9KqD7YOf"]] <- NULL
    relevant.data
}
Displayr/flipRegression documentation built on March 2, 2024, 3:51 a.m.