#' @title Impute missing values
#' @description
#' Fills the missing values of one column of a data frame with the column
#' mean, the column median, or the predictions of a linear regression of that
#' column on all other columns.
#'
#' Before imputing, the candidate methods (mean, median, linear model and an
#' equal-weight blend of the three) are scored on a random 10% hold-out of
#' the complete cases, and the comparison table is printed together with the
#' method that reached the lowest RMSE. This comparison is informational
#' only: the method that is actually applied is selected by the `mean` and
#' `median` flags.
#'
#' @param df is the dataframe containing missing values
#' @param col is the position (integer index) or the name of the column of
#'   `df` whose missing values should be imputed. Factor and character
#'   columns are rejected.
#' @param mean is a TRUE or FALSE parameter; when TRUE the mean of the
#'   observed values is imputed. Takes precedence over `median`.
#' @param median is a TRUE or FALSE parameter; when TRUE (and `mean` is not)
#'   the median of the observed values is imputed.
#' @param linear is a TRUE or FALSE parameter. It is not consulted:
#'   regression imputation is used whenever neither `mean` nor `median` is
#'   TRUE. Kept so existing calls keep working.
#' @return `df` with the missing values of column `col` replaced. With
#'   regression imputation, rows whose prediction is `NA` stay `NA`. If the
#'   column has no missing values `df` is returned unchanged.
#' @export
#'
impute <- function(df, col, mean = FALSE, median = FALSE, linear = TRUE) {
  # Accept a column name as well as a position; everything below works with
  # the integer position.
  if (is.character(col)) {
    col <- match(col, colnames(df))
  }
  if (length(col) != 1L || is.na(col)) {
    stop("`col` must identify exactly one column of `df`", call. = FALSE)
  }

  # `[[` always yields the bare column, so the type test is a scalar even for
  # ordered factors (two-element class vector) or tibbles.
  target <- df[[col]]
  if (is.factor(target) || is.character(target)) {
    stop('Value is not numeric or integer, stopping operation')
  }

  # Build `target ~ .` from a symbol so non-syntactic column names work.
  model_formula <- stats::reformulate(".", response = as.name(colnames(df)[col]))

  # One-row summary of the error metrics of a set of predictions.
  error_metrics <- function(method, actual, predicted) {
    residual <- actual - predicted
    data.frame(
      Method = method,
      MSE = base::mean(residual^2),
      MAE = base::mean(abs(residual)),
      RMSE = sqrt(base::mean(residual^2))
    )
  }

  # ---- Benchmark the candidate methods on the complete cases ----
  complete_rows <- df[stats::complete.cases(df), , drop = FALSE]
  n_complete <- nrow(complete_rows)
  if (n_complete < 2L) {
    warning(
      "fewer than two complete cases; skipping the method comparison",
      call. = FALSE
    )
  } else {
    # Split by position within `complete_rows`. Row names still refer to the
    # rows of `df`, so they must not be used as indices here. Always hold out
    # at least one row so small inputs do not end up with an empty split.
    n_holdout <- max(1L, floor(n_complete * 0.1))
    holdout_idx <- sample.int(n_complete, n_holdout)
    test_cases <- complete_rows[holdout_idx, , drop = FALSE]
    train_cases <- complete_rows[-holdout_idx, , drop = FALSE]
    actual <- test_cases[[col]]

    mean_val <- base::mean(train_cases[[col]])
    median_val <- stats::median(train_cases[[col]])
    fit <- stats::lm(model_formula, data = train_cases)
    predictions <- stats::predict(fit, test_cases)
    # Equal weights that sum to exactly one.
    blended <- (predictions + mean_val + median_val) / 3

    output <- rbind(
      error_metrics("Mean", actual, mean_val),
      error_metrics("Median", actual, median_val),
      error_metrics("LinearModel", actual, predictions),
      error_metrics("BlendedModel", actual, blended)
    )
    # which.min() returns a single winner even when RMSEs tie.
    best_option <- as.character(output$Method[which.min(output$RMSE)])
    print(paste('Based upon using a sample of 10% of the complete cases the imputer ran all four methods and believes', best_option, 'is the best method'))
    print(output)
  }

  # ---- Impute ----
  is_missing <- is.na(target)
  if (!any(is_missing)) {
    # Nothing to fill in.
    return(df)
  }

  if (isTRUE(as.logical(mean))) {
    print('imputing mean')
    df[[col]][is_missing] <- base::mean(target[!is_missing])
  } else if (isTRUE(as.logical(median))) {
    print('imputing median')
    df[[col]][is_missing] <- stats::median(target[!is_missing])
  } else {
    print('imputing regression')
    fit <- stats::lm(model_formula, data = df[!is_missing, , drop = FALSE])
    df[[col]][is_missing] <- stats::predict(
      fit,
      df[is_missing, -col, drop = FALSE]
    )
  }
  df
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.