R/drop_cor.R

Defines functions drop_cor

Documented in drop_cor

#' Drop correlated columns
#'
#' @param df A dataframe
#' @param thresh
#' @param verbose
#' @export
#' @importFrom tibble as.tibble
#' @importFrom dplyr select
#' @importFrom dplyr select_if
#' @importFrom magrittr %>%

drop_cor <- function(df, thresh = 1, verbose = T){
  # perform basic column check on dataframe input
  check_df_cols(df)
  
  # filter to only the numeric variables
  # df_numeric <- df %>% select_if(is.numeric)
  # remove anything that is constant
  # df_numeric <- df_numeric %>% select(-which(sapply(df_numeric, var, na.rm = T) == 0))
  
  # if the threshold is 1, can make efficiency by doing a 
  # pass over a subset
  # if(thresh == 1){
  #   n_quick_pass <- min(nrow(df_numeric), 100)
  #   set.seed(1)
  #   cor_quick <- suppressWarnings(df_numeric %>% sample_n(n_quick_pass) %>% inspect_cor())
  #   cor_quick <- cor_quick %>% filter(abs(corr) == 1 | is.na(corr)) %>% select(col_1, col_2)
  #   # cut down the size of df_numeric based on quick search
  #   if(nrow(cor_quick) > 1){
  #     names_cor <- unique(cor_quick %>% unlist())
  #     df_numeric <- df_numeric[names_cor]
  #   } else {
  #     # if there is nothing 100% correlated even in the quick pass
  #     return()
  #   }
  # }
  # perform a full pass
  if(ncol(df_numeric) > 0){
    # get full correlation matrix
    df_cor <- df_numeric %>% inspect_cor()
    # get correlated features to drop
    cor_nms <- df_cor %>% filter(abs(corr) > thresh | is.na(corr)) %>% select(col_1, col_2)
    # names to drop
    if(nrow(cor_quick) > 1){
      names_cor <- unique(cor_nms %>% unlist())
    }
  }

  
  # invisibly return the df for further summaries
  invisible(df)
}
alastairrushworth/mlblitz documentation built on Nov. 1, 2019, 9:06 p.m.