#' Automated data cleaning
#'
#' Cleans features in a dataset for machine learning purposes. Utilizes the edaFrame generated by exploreData. Cleaning involves imputation, clipping outliers and creating tracking features
#'
#' @param x [data.frame | Required] Data.frame containing numeric features to transform
#' @param feats [character vector | Required] Character vector of features to clean
#' @param edaFrame [data.frame | Required] Data.frame object returned by exploreData function
#' @param trackingFeats [logical | Optional] Should tracking features be created. Tracking features are binary features that keep track of data before changes have been applied to, useful for tree type models
#' @param clipOutliers [logical | Optional] Should outliers be clipped using the method specified in the exploreData function
#' @param imputeMissing [logical | Optional] Should features be imputed using median imputation for numerics and mode for categoricals
#' @param progress [logical | Optional] Display progress
#' @param autoCode [logical | Optional] Should code be generated when running the function
#'
#' @return List containing data.frame with cleaned features as well as code when autoCode is TRUE
#' @export
#'
#' @examples
#' eda <- exploreData(iris)
#' cleaned <- cleanFeatures(x = iris, feats = names(iris), edaFrame = eda)
#' @author
#' Xander Horn
cleanFeatures <- function(x, feats, edaFrame, trackingFeats = TRUE, clipOutliers = TRUE, imputeMissing = TRUE,
progress = FALSE, autoCode = TRUE){
code <- list()
if(progress == TRUE){
pb <- txtProgressBar(min = 0, max = length(feats), style = 3)
}
for(i in 1:length(feats)){
tempEDA <- as.data.frame(subset(edaFrame, edaFrame$Feature == feats[i]))
selected <- as.character(tempEDA$Feature)
if(tempEDA$Class %in% c("integer","numeric")){
if(clipOutliers == TRUE & tempEDA$Type != "Indicator"){
if(trackingFeats == TRUE){
feat <- paste0("XEC_Outlier_",feats[i])
x[,feat] <- ifelse(is.na(x[,selected]) == TRUE, 0,
ifelse(round(x[,selected],6) < round(tempEDA$LowerOutlierVal,6) | round(x[,selected],6) > round(tempEDA$UpperOutlierVal,6), 1, 0))
x[,selected] <- ifelse(round(x[,selected],6) < round(tempEDA$LowerOutlierVal,6) | round(x[,selected],6) > round(tempEDA$UpperOutlierVal,6), as.numeric(tempEDA$Median), x[,selected])
code[[length(code) + 1]] <- paste0("x[,'",feat,"'] <- ifelse(is.na(x[,'",selected,"']) == TRUE, 0,
ifelse(round(x[,'",selected,"'],6) < ",round(tempEDA$LowerOutlierVal,6)," | round(x[,'",selected,"'],6) > ",round(tempEDA$UpperOutlierVal,6),", 1, 0))")
code[[length(code) + 1]] <- paste0("x[,'",selected,"'] <- ifelse(round(x[,'",selected,"'],6) < ",round(tempEDA$LowerOutlierVal,6)," | round(x[,'",selected,"'],6) > ",round(tempEDA$UpperOutlierVal,6),", ",as.numeric(tempEDA$Median),", x[,'",selected,"'])")
} else {
x[,selected] <- ifelse(round(x[,selected],6) < round(tempEDA$LowerOutlierVal,6) | round(x[,selected],6) > round(tempEDA$UpperOutlierVal,6), as.numeric(tempEDA$Median), x[,selected])
code[[length(code) + 1]] <- paste0("x[,'",selected,"'] <- ifelse(round(x[,'",selected,"'],6) < ",round(tempEDA$LowerOutlierVal,6)," | round(x[,'",selected,"'],6) > ",round(tempEDA$UpperOutlierVal,6),", ",as.numeric(tempEDA$Median),", x[,'",selected,"'])")
}
}
if(imputeMissing == TRUE){
if(trackingFeats == TRUE){
feat <- paste0("XEC_Miss_",feats[i])
x[,feat] <- ifelse(is.na(x[,selected]) == TRUE, 1, 0)
x[,selected] <- ifelse(is.na(x[,selected]) == TRUE, as.numeric(tempEDA$ImputationVal), x[,selected])
code[[length(code) + 1]] <- paste0("x[,'",feat,"'] <- ifelse(is.na(x[,'",selected,"']) == TRUE, 1, 0)")
code[[length(code) + 1]] <- paste0("x[,'",selected,"'] <- ifelse(is.na(x[,'",selected,"']) == TRUE, ",as.numeric(tempEDA$ImputationVal),", x[,'",selected,"'])")
} else {
x[,selected] <- ifelse(is.na(x[,selected]) == TRUE, as.numeric(tempEDA$ImputationVal), x[,selected])
code[[length(code) + 1]] <- paste0("x[,'",selected,"'] <- ifelse(is.na(x[,'",selected,"']) == TRUE, ",as.numeric(tempEDA$ImputationVal),", x[,'",selected,"'])")
}
}
} else if(tempEDA$Class == "character"){
if(trackingFeats == TRUE){
if(tempEDA$LowPropCatLevels != "0"){
LowPropCatLevels <- gsub("'","",unlist(str_split(string = tempEDA$LowPropCatLevels, pattern = ",")))
feat <- paste0("XEC_CatLowProp_",feats[i])
x[,feat] <- ifelse(x[,selected] %in% LowPropCatLevels, 1, 0)
code[[length(code) + 1]] <- paste0("x[,'",feat,"'] <- ifelse(x[,'",selected,"'] %in% c(",tempEDA$LowPropCatLevels,"), 1, 0)")
}
}
if(imputeMissing == TRUE){
if(trackingFeats == TRUE){
feat <- paste0("XEC_Miss_",feats[i])
x[,feat] <- ifelse(is.na(x[,selected]) == TRUE, 1, 0)
x[,selected] <- ifelse(is.na(x[,selected]) == TRUE, as.character(tempEDA$ImputationVal), as.character(x[,selected]))
code[[length(code) + 1]] <- paste0("x[,'",feat,"'] <- ifelse(is.na(x[,'",selected,"']) == TRUE, 1, 0)")
code[[length(code) + 1]] <- paste0("x[,'",feats[i],"'] <- ifelse(is.na(x[,'",selected,"']) == TRUE, '",tempEDA$ImputationVal,"', as.character(x[,'",selected,"']))")
} else {
x[,selected] <- ifelse(is.na(x[,selected]) == TRUE, as.character(tempEDA$ImputationVal), as.character(x[,selected]))
code[[length(code) + 1]] <- paste0("x[,'",feats[i],"'] <- ifelse(is.na(x[,'",selected,"']) == TRUE, '",tempEDA$ImputationVal,"', as.character(x[,'",selected,"']))")
}
}
}
if(progress == TRUE){
setTxtProgressBar(pb, i)
}
}
if(progress == TRUE){
close(pb)
}
cat("\n")
if(autoCode == FALSE){
return(list(feats = x))
} else {
return(list(feats = x, code = code))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.