#' Drop correlated columns
#'
#' @param df A dataframe
#' @param thresh
#' @param verbose
#' @export
#' @importFrom tibble as.tibble
#' @importFrom dplyr select
#' @importFrom dplyr select_if
#' @importFrom magrittr %>%
drop_cor <- function(df, thresh = 1, verbose = T){
# perform basic column check on dataframe input
check_df_cols(df)
# filter to only the numeric variables
# df_numeric <- df %>% select_if(is.numeric)
# remove anything that is constant
# df_numeric <- df_numeric %>% select(-which(sapply(df_numeric, var, na.rm = T) == 0))
# if the threshold is 1, can make efficiency by doing a
# pass over a subset
# if(thresh == 1){
# n_quick_pass <- min(nrow(df_numeric), 100)
# set.seed(1)
# cor_quick <- suppressWarnings(df_numeric %>% sample_n(n_quick_pass) %>% inspect_cor())
# cor_quick <- cor_quick %>% filter(abs(corr) == 1 | is.na(corr)) %>% select(col_1, col_2)
# # cut down the size of df_numeric based on quick search
# if(nrow(cor_quick) > 1){
# names_cor <- unique(cor_quick %>% unlist())
# df_numeric <- df_numeric[names_cor]
# } else {
# # if there is nothing 100% correlated even in the quick pass
# return()
# }
# }
# perform a full pass
if(ncol(df_numeric) > 0){
# get full correlation matrix
df_cor <- df_numeric %>% inspect_cor()
# get correlated features to drop
cor_nms <- df_cor %>% filter(abs(corr) > thresh | is.na(corr)) %>% select(col_1, col_2)
# names to drop
if(nrow(cor_quick) > 1){
names_cor <- unique(cor_nms %>% unlist())
}
}
# invisibly return the df for further summaries
invisible(df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.