Nothing
#' Detect if any column of a data.frame is a duplicate of another
#'
#' It occasionally happens that 2 (or more) columns in dataframe are exactly identical.
#' This could lead to redundant computational cost and unexpected behavior in Machine Learning methods.
#' This function scans though all column combinations of dataframe to examine if any 2 columns are exactly identical.
#'
#' @param dataset A data.frame
#' @param return_type How to return detected duplicate columns
#' Use "col_names", "col_positions" or "dataset" to return dataset with deleted duplicate columns
#' @param duplicate_col If 2 columns are identical, which of the 2 columns should be treated as duplicate?
#' Use "right" for right column, "left" for left.
#' @return A vector of duplicate column names or column positions or dataset with deleted duplicate columns. Use return_type parameter to specify.
#' @export
#' @importFrom magrittr %>%
#' @importFrom dplyr select all_of any_of last_col
#' @examples
#' \dontrun{
#' detect_dupl_cols(dataset = head(mutate(mtcars, mpg_2 = mpg)), duplicate_col = "right")
#' }
detect_dupl_cols <- function(dataset, return_type = "col_names", duplicate_col = "right"){
# Check/fix inputs
if("matrix" %in% class(dataset) == T){
warning("matrix was provided when data.frame was expected. Attempting to convert to data.frame")
dataset = dataset %>% as.data.frame
}
if("data.frame" %in% class(dataset) == F)
stop("class of dataset must be a data.frame")
if(return_type %in% c("col_names", "col_positions", "dataset") == F)
stop('return_type must be "col_names", "col_positions" or "dataset". Or simply use default value')
if(duplicate_col %in% c("left", "right") == F)
stop('duplicate_col must be "left" or "left". Or simply use default value')
dataset = dataset %>% as.data.frame() # forcing data.frame like objects to data.frame only
# A vector to collect names of duplicated columns
duplicate_cols = NULL
for(col_1 in dataset %>% select(1:last_col(1)) %>% names()){
# Don't check for duplicate of already detected duplicate columns, since its duplicates would have been detected already
if(col_1 %in% duplicate_cols){
next
}
# Check col_1 is duplicated with any of the columns to its right
for(col_2 in dataset %>% select(all_of(col_1):last_col()) %>% names()){
# Don't check for duplicate of already detected duplicate columns, since its duplicates would have been detected already
if(col_2 %in% duplicate_cols){
next
}
# Don't treat a column as a duplicate of itself, obviously!
if(col_1 == col_2){
next
}
# If col_1 and col_2 are exactly identical, collect column name.
if(all(dataset[,col_1] == dataset[,col_2])){
if(duplicate_col == "right"){
duplicate_cols = c(duplicate_cols, col_2)
}else if(duplicate_col == "left"){
duplicate_cols = c(duplicate_cols, col_1)
}
}
} # col_2 for loop ends here
} # col_1 for loop ends here
if(return_type == "col_names"){
if(is.null(duplicate_cols)){
message("No duplicate columns found. Returning NULL.")
return(NULL)
}
return(duplicate_cols)
}else if(return_type == "col_positions"){
if(is.null(duplicate_cols)){
message("No duplicate columns found. Returning NULL.")
return(NULL)
}
return(which(names(dataset) %in% duplicate_cols))
}else if(return_type == "dataset"){
if(is.null(duplicate_cols)){
message("No duplicate columns found. Returning original dataset.")
return(dataset)
}
return(dataset %>% select(-any_of(duplicate_cols)))
}
} # function ends here
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.