#' Collinearity Diagnostics
#' @description
#' `r badge('stable')`
#'
#' Perform a (multi)collinearity diagnostic of a correlation matrix of predictor
#' variables using several indicators, as shown by Olivoto et al. (2017).
#'
#'
#' @param .data The data to be analyzed. It must be a symmetric correlation
#' matrix, or a data frame, possible with grouped data passed from
#' [dplyr::group_by()].
#' @param ... Variables to use in the correlation. If `...` is null then
#' all the numeric variables from `.data` are used. It must be a single
#' variable name or a comma-separated list of unquoted variables names.
#'@param by One variable (factor) to compute the function by. It is a shortcut
#' to [dplyr::group_by()]. To compute the statistics by more than
#' one grouping variable use that function.
#' @param n If a correlation matrix is provided, then `n` is the number of
#' objects used to compute the correlation coefficients.
#' @return If `.data` is a grouped data passed from [dplyr::group_by()]
#' then the results will be returned into a list-column of data frames.
#'
#' * **cormat** A symmetric Pearson's coefficient correlation matrix
#' between the variables
#'
#' * **corlist** A hypothesis testing for each of the correlation
#' coefficients
#'
#' * **evalevet** The eigenvalues with associated eigenvectors of the
#' correlation matrix
#'
#' * **indicators** A `data.frame` with the following indicators
#' - `VIF` The Variance Inflation Factors, being the diagonal elements of
#' the inverse of the correlation matrix.
#' - `cn` The Condition Number of the correlation matrix, given by the
#' ratio between the largest and smallest eigenvalue.
#' - `det` The determinant of the correlation matrix.
#' - `ncorhigh` Number of correlation greather than |0.8|.
#' - `largest_corr` The largest correlation (in absolute value) observed.
#' - `smallest_corr` The smallest correlation (in absolute value)
#' observed.
#' - `weight_var` The variables with largest eigenvector (largest weight)
#' in the eigenvalue of smallest value, sorted in decreasing order.
#'
#' @md
#' @author Tiago Olivoto \email{tiagoolivoto@@gmail.com}
#' @references
#' Olivoto, T., V.Q. Souza, M. Nardino, I.R. Carvalho, M. Ferrari, A.J.
#' Pelegrin, V.J. Szareski, and D. Schmidt. 2017. Multicollinearity in path
#' analysis: a simple method to reduce its effects. Agron. J. 109:131-142.
#' \doi{10.2134/agronj2016.04.0196}
#'
#'
#' @export
#' @examples
#'\donttest{
#' # Using the correlation matrix
#' library(metan)
#'
#' cor_iris <- cor(iris[,1:4])
#' n <- nrow(iris)
#'
#' col_diag <- colindiag(cor_iris, n = n)
#'
#'
#' # Using a data frame
#' col_diag_gen <- data_ge2 %>%
#' group_by(GEN) %>%
#' colindiag()
#'
#' # Diagnostic by levels of a factor
#' # For variables with "N" in variable name
#' col_diag_gen <- data_ge2 %>%
#' group_by(GEN) %>%
#' colindiag(contains("N"))
#'}
colindiag <- function(.data,
...,
by = NULL,
n = NULL) {
if (!has_class(.data, c("matrix", "data.frame", "grouped_df", "covcor_design", "tbl_df"))) {
stop("The object 'x' must be a correlation matrix, a data.frame or a grouped data.frame")
}
if (is.matrix(.data) && is.null(n)) {
stop("You have a matrix but the sample size used to compute the correlations (n) was not declared.")
}
if (is.data.frame(.data) && !is.null(n)) {
stop("You cannot informe the sample size because a data frame was used as input.")
}
if (!missing(by)){
if(length(as.list(substitute(by))[-1L]) != 0){
stop("Only one grouping variable can be used in the argument 'by'.\nUse 'group_by()' to pass '.data' grouped by more than one variable.", call. = FALSE)
}
.data <- group_by(.data, {{by}})
}
if(is_grouped_df(.data)){
results <- .data %>%
doo(colindiag,
...,
n = n)
return(results %>% set_class(c("colingroup", "tbl_df", "tbl", "data.frame")))
}
internal <- function(x) {
if (is.matrix(x)) {
cor.x <- x
}
if (is.data.frame(x)) {
cor.x <- cor(x)
n <- nrow(x)
}
eigen <- eigen(cor.x)
Det <- det(cor.x)
NC <- max(eigen$values)/min(eigen$values)
Aval <- data.frame(eigen$values)
names(Aval) <- "Eigenvalues"
Avet <- data.frame(t(eigen$vectors))
names(Avet) <- colnames(x)
AvAvet <- cbind(Aval, Avet)
VIF <- data.frame(diag(solve_svd(cor.x))) |> rownames_to_column("var")
names(VIF)[2] <- "VIF"
results <- data.frame(linear = as.vector(t(cor.x)[lower.tri(cor.x,
diag = F)]))
results <- dplyr::mutate(results, t = linear * (sqrt(n -
2))/(sqrt(1 - linear^2)), prob = 2 * (1 - pt(abs(t),
df = n - 2)))
names <- colnames(x)
combnam <- combn(names, 2, paste, collapse = " x ")
rownames(results) <- names(sapply(combnam, names))
largest_corr <- paste0(rownames(results)[which.max(abs(results$linear))],
" = ", round(results[which.max(abs(results$linear)),
1], 3))
smallest_corr <- paste0(rownames(results)[which.min(abs(results$linear))],
" = ", round(results[which.min(abs(results$linear)),
1], 3))
ncorhigh <- sum(results$linear >= abs(0.8))
ultimo <- data.frame(Peso = t(AvAvet[c(nrow(AvAvet)), ])[-c(1), ])
abs <- data.frame(Peso = abs(ultimo[, "Peso"]))
rownames(abs) <- rownames(ultimo)
ultimo <- abs[order(abs[, "Peso"], decreasing = T), , drop = FALSE]
pesovarname <- paste(rownames(ultimo), collapse = " > ")
indicators <- data.frame(cn = NC,
det = Det,
ncohigh = ncorhigh,
largest_corr = largest_corr,
smallest_corr = smallest_corr,
weight_var = pesovarname)
final <- list(cormat = data.frame(cor.x),
corlist = results,
evalevet = AvAvet,
VIF = VIF,
indicators = indicators)
invisible(final)
}
if (is.matrix(.data)) {
out <- internal(.data)
}
if (is.data.frame(.data)) {
if(!missing(...)){
dfs <- select_cols(.data, ...) %>%
select_numeric_cols()
if(has_na(dfs)){
dfs <- remove_rows_na(dfs)
has_text_in_num(dfs)
}
} else{
dfs <- select_numeric_cols(.data)
if(has_na(dfs)){
dfs <- remove_rows_na(dfs)
has_text_in_num(dfs)
}
}
out <- internal(dfs)
}
invisible(out %>% set_class("colindiag"))
}
#' Print an object of class colindiag
#'
#' Print the `colindiag` object in two ways. By default, the results
#' are shown in the R console. The results can also be exported to the directory
#' into a *.txt file.
#'
#'
#' @param x The object of class `colindiag`
#' @param export A logical argument. If `TRUE`, a *.txt file is exported to
#' the working directory.
#' @param file.name The name of the file if `export = TRUE`
#' @param digits The significant digits to be shown.
#' @param ... Options used by the tibble package to format the output. See
#' [`tibble::print()`][tibble::formatting] for more details.
#' @author Tiago Olivoto \email{tiagoolivoto@@gmail.com}
#' @method print colindiag
#' @export
#' @examples
#' \donttest{
#' library(metan)
#' col <- colindiag(data_ge2)
#' print(col)
#' }
print.colindiag <- function(x, export = FALSE, file.name = NULL, digits = 3, ...) {
if (export == TRUE) {
file.name <- ifelse(is.null(file.name) == TRUE, "colindiag print", file.name)
sink(paste0(file.name, ".txt"))
}
if(has_class(x, "colingroup")){
for (i in 1:nrow(x)){
df <- x[i,]
names <-
select(df, -data) %>%
concatenate(everything(), pull = TRUE)
cat("Level:", names, "\n")
cat("---------------------------------------------------------------------------\n")
dat <- df[["data"]][[1]]
cn <- dat$indicators$cn
VIF <- dat$VIF$VIF
if (cn > 1000) {
cat(paste0("Severe multicollinearity in the matrix! Pay attention on the variables listed bellow\n",
"cn = ", round(cn, digits), "\n"))
}
if (cn < 100) {
cat(paste0("Weak multicollinearity in the matrix\n",
"cn = ", round(cn, digits), "\n"))
}
if (cn > 100 & cn < 1000) {
cat(paste0("The multicollinearity in the matrix should be investigated.\n",
"cn = ", round(cn, digits), "\n", "Largest VIF = ",
max(VIF), "\n"))
}
cat(paste0("Matrix determinant: ", round(dat$indicators$det, 7)), "\n")
cat(paste0("Largest correlation: ", dat$indicators$largest_corr), "\n")
cat(paste0("Smallest correlation: ", dat$indicators$smallest_corr), "\n")
cat(paste0("Number of VIFs > 10: ", length(which(VIF > 10))), "\n")
cat(paste0("Number of correlations with r >= |0.8|: ",dat$indicators$ncorhigh), "\n")
cat(paste0("Variables with largest weight in the last eigenvalues: \n", dat$indicators$weight_var), "\n")
cat("---------------------------------------------------------------------------\n")
}
} else{
cn <- x$indicators$cn
VIF <- x$VIF$VIF
if (cn > 1000) {
cat(paste0("Severe multicollinearity in the matrix! Pay attention on the variables listed bellow\n",
"cn = ", round(cn, digits), "\n"))
}
if (cn < 100) {
cat(paste0("Weak multicollinearity in the matrix\n",
"cn = ", round(cn, digits), "\n"))
}
if (cn > 100 & cn < 1000) {
cat(paste0("The multicollinearity in the matrix should be investigated.\n",
"cn = ", round(cn, digits), "\n", "Largest VIF = ",
max(VIF), "\n"))
}
cat(paste0("Matrix determinant: ", round(x$indicators$det, 7)), "\n")
cat(paste0("Largest correlation: ", x$indicators$largest_corr), "\n")
cat(paste0("Smallest correlation: ", x$indicators$smallest_corr), "\n")
cat(paste0("Number of VIFs > 10: ", length(which(VIF > 10))), "\n")
cat(paste0("Number of correlations with r >= |0.8|: ",x$indicators$ncorhigh), "\n")
cat(paste0("Variables with largest weight in the last eigenvalues: \n", x$indicators$weight_var), "\n")
}
if (export == TRUE) {
sink()
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.