R/cleanFeatures.R

Defines functions cleanFeatures

Documented in cleanFeatures

#' Automated data cleaning
#'
#' Cleans features in a dataset for machine learning purposes. Utilizes the edaFrame generated by exploreData. Cleaning involves imputation, clipping outliers and creating tracking features
#'
#' @param x [data.frame | Required] Data.frame containing numeric features to transform
#' @param feats [character vector | Required] Character vector of features to clean
#' @param edaFrame [data.frame | Required] Data.frame object returned by exploreData function
#' @param trackingFeats [logical | Optional] Should tracking features be created. Tracking features are binary features that keep track of data before changes have been applied to, useful for tree type models
#' @param clipOutliers [logical | Optional] Should outliers be clipped using the method specified in the exploreData function
#' @param imputeMissing [logical | Optional] Should features be imputed using median imputation for numerics and mode for categoricals
#' @param progress [logical | Optional] Display progress
#' @param autoCode [logical | Optional] Should code be generated when running the function
#'
#' @return List containing data.frame with cleaned features as well as code when autoCode is TRUE
#' @export
#'
#' @examples
#' eda <- exploreData(iris)
#' cleaned <- cleanFeatures(x = iris, feats = names(iris), edaFrame = eda)
#' @author
#' Xander Horn
cleanFeatures <- function(x, feats, edaFrame, trackingFeats = TRUE, clipOutliers = TRUE, imputeMissing = TRUE,
							progress = FALSE, autoCode = TRUE){


	code <- list()
	if(progress == TRUE){
		pb <- txtProgressBar(min = 0, max = length(feats), style = 3)
	}

	for(i in 1:length(feats)){
		tempEDA <- as.data.frame(subset(edaFrame, edaFrame$Feature == feats[i]))
      	selected <- as.character(tempEDA$Feature)

      	if(tempEDA$Class %in% c("integer","numeric")){
      		if(clipOutliers == TRUE & tempEDA$Type != "Indicator"){
	          	if(trackingFeats == TRUE){

		            feat <- paste0("XEC_Outlier_",feats[i])
		            x[,feat] <- ifelse(is.na(x[,selected]) == TRUE, 0,
		            				ifelse(round(x[,selected],6) < round(tempEDA$LowerOutlierVal,6) | round(x[,selected],6) > round(tempEDA$UpperOutlierVal,6), 1, 0))

		            x[,selected] <- ifelse(round(x[,selected],6) < round(tempEDA$LowerOutlierVal,6) | round(x[,selected],6) > round(tempEDA$UpperOutlierVal,6), as.numeric(tempEDA$Median), x[,selected])

		            code[[length(code) + 1]] <- paste0("x[,'",feat,"'] <- ifelse(is.na(x[,'",selected,"']) == TRUE, 0,
																ifelse(round(x[,'",selected,"'],6) < ",round(tempEDA$LowerOutlierVal,6)," | round(x[,'",selected,"'],6) > ",round(tempEDA$UpperOutlierVal,6),", 1, 0))")
		            code[[length(code) + 1]] <- paste0("x[,'",selected,"'] <- ifelse(round(x[,'",selected,"'],6) < ",round(tempEDA$LowerOutlierVal,6)," | round(x[,'",selected,"'],6) > ",round(tempEDA$UpperOutlierVal,6),", ",as.numeric(tempEDA$Median),", x[,'",selected,"'])")

	          	} else {

		            x[,selected] <- ifelse(round(x[,selected],6) < round(tempEDA$LowerOutlierVal,6) | round(x[,selected],6) > round(tempEDA$UpperOutlierVal,6), as.numeric(tempEDA$Median), x[,selected])
		            code[[length(code) + 1]] <- paste0("x[,'",selected,"'] <- ifelse(round(x[,'",selected,"'],6) < ",round(tempEDA$LowerOutlierVal,6)," | round(x[,'",selected,"'],6) > ",round(tempEDA$UpperOutlierVal,6),", ",as.numeric(tempEDA$Median),", x[,'",selected,"'])")

	          	}
        	}

        	if(imputeMissing == TRUE){
	          	if(trackingFeats == TRUE){

		            feat <- paste0("XEC_Miss_",feats[i])
		            x[,feat] <- ifelse(is.na(x[,selected]) == TRUE, 1, 0)
		            x[,selected] <- ifelse(is.na(x[,selected]) == TRUE, as.numeric(tempEDA$ImputationVal), x[,selected])

		            code[[length(code) + 1]] <- paste0("x[,'",feat,"'] <- ifelse(is.na(x[,'",selected,"']) == TRUE, 1, 0)")
		            code[[length(code) + 1]] <- paste0("x[,'",selected,"'] <- ifelse(is.na(x[,'",selected,"']) == TRUE, ",as.numeric(tempEDA$ImputationVal),", x[,'",selected,"'])")

	          	} else {

		            x[,selected] <- ifelse(is.na(x[,selected]) == TRUE, as.numeric(tempEDA$ImputationVal), x[,selected])
		            code[[length(code) + 1]] <- paste0("x[,'",selected,"'] <- ifelse(is.na(x[,'",selected,"']) == TRUE, ",as.numeric(tempEDA$ImputationVal),", x[,'",selected,"'])")

	          	}
        	}

      	} else if(tempEDA$Class == "character"){

	        if(trackingFeats == TRUE){
	        	if(tempEDA$LowPropCatLevels != "0"){
					LowPropCatLevels <- gsub("'","",unlist(str_split(string = tempEDA$LowPropCatLevels, pattern = ",")))
		          	feat <- paste0("XEC_CatLowProp_",feats[i])
		          	x[,feat] <- ifelse(x[,selected] %in% LowPropCatLevels, 1, 0)

		          	code[[length(code) + 1]] <- paste0("x[,'",feat,"'] <- ifelse(x[,'",selected,"'] %in% c(",tempEDA$LowPropCatLevels,"), 1, 0)")

	        	}
	        }

	        if(imputeMissing == TRUE){
	        	if(trackingFeats == TRUE){

		            feat <- paste0("XEC_Miss_",feats[i])
		            x[,feat] <- ifelse(is.na(x[,selected]) == TRUE, 1, 0)
		            x[,selected] <- ifelse(is.na(x[,selected]) == TRUE, as.character(tempEDA$ImputationVal), as.character(x[,selected]))

	            	code[[length(code) + 1]] <- paste0("x[,'",feat,"'] <- ifelse(is.na(x[,'",selected,"']) == TRUE, 1, 0)")
	            	code[[length(code) + 1]] <- paste0("x[,'",feats[i],"'] <- ifelse(is.na(x[,'",selected,"']) == TRUE, '",tempEDA$ImputationVal,"', as.character(x[,'",selected,"']))")

	          	} else {

	            	x[,selected] <- ifelse(is.na(x[,selected]) == TRUE, as.character(tempEDA$ImputationVal), as.character(x[,selected]))
	            	code[[length(code) + 1]] <- paste0("x[,'",feats[i],"'] <- ifelse(is.na(x[,'",selected,"']) == TRUE, '",tempEDA$ImputationVal,"', as.character(x[,'",selected,"']))")

	          	}
	        }
		}
		if(progress == TRUE){
			setTxtProgressBar(pb, i)
		}
	}

	if(progress == TRUE){
		close(pb)
	}
	cat("\n")

	if(autoCode == FALSE){
    	return(list(feats = x))
  	} else {
    	return(list(feats = x, code = code))
 	}
}
XanderHorn/autoML documentation built on Aug. 5, 2020, 11:45 a.m.