R/missEval.R

Defines functions missEval

Documented in missEval

#' Evaluate imputation accuracy and consistency 
#'
#' Compare observed values to imputed values for a single variable
#'
#' @param original original \code{data.frame}
#' @param amputated amputated \code{data.frame}
#' @param imputed imputed \code{data.frame}
#' @param var name of variable for evaluation 
#' @param id unique id in \code{data.frame} 
#'   
#' @details 
#' If a unique \code{id} is supplied, the order of all three data frames is aligned and otherwise the correct sorting is assumed. 
#' The function compares amputated values to imputed values using the Kolmogorov–Smirnov statistic and the standardized RMSE (numerical variables) or the classification error (for variable types character, factor, ordered, logical). 
#' Amputated values are observed in \code{original} but not in \code{amputated}. The RMSE is standardized with the standard deviation observed in \code{original}.
#' While RMSE and classification error measure imputation accuracy,  the Kolmogorov–Smirnov statistic measures imputation consistency.
#'  
#' @return 
#' \code{data.frame}. 
#'
#' @examples 
#' df <- tibble(a=1:5, b=LETTERS[1:5])
#' df_amp <- missMaker(df, var='a', size=3)
#' df_imp <- missMean(df_amp)
#' missEval(df, df_amp, df_imp, var='a')
#' 
#' 
#' @export
missEval <- function(
		original,
		amputated,
		imputed,
		var, 
		id=NULL){

	if(!(nrow(original)==nrow(amputated) & 
		 nrow(original)==nrow(imputed)) ) stop("Data frames with different number of rows are not allowed.")
	if(!(ncol(original)==ncol(amputated) & 
		 	 ncol(original)==ncol(imputed)) ) stop("Data frames with different number of columns are not allowed.")

	# Sort rows and columns
	if(!is.null(id)){
		amputated <- sort_df(amputated,original,id=id)
		imputed <- sort_df(imputed,original,id=id)
	} else { 
		warning("Assuming that all three data frames are sorted.")
	}

	amputated <- amputated[,colnames(original)]
	imputed <- imputed[,colnames(original)]
	
	if(is.character(original[[var]])==TRUE) stop("Character variables can not be used.")

	raw <- original
	idx <- !is.na(original[,var])
	original <- original[idx,]
	amputated <- amputated[idx,]
	imputed <- imputed[idx,]

	obs <- original[is.na(amputated[,var]),]
	imp <- imputed[is.na(amputated[,var]),]

	obs <- obs[[var]]
	imp <- imp[[var]]

  if(length(obs) < 3) stop("Fewer than 2 values observed.")
	
	if( is_cat(obs) ){
		acc <- tibble(metric='pfc', err=pfc_stat(obs, imp))
		ks <- tibble(metric='ks', err=ks_stat(obs, imp))
		smd <- tibble(metric='smd', err=smd_stat(obs, imp))
	} else {
		sd_ <- sd(raw[[var]],na.rm=TRUE)
		acc <- tibble(metric='nrmse', err=rmse_stat(obs,imp)/sd_ )		
		ks <- tibble(metric='ks', err=ks_stat(obs, imp))
		smd <- tibble(metric='smd', err=smd_stat(obs, imp))
	}	
	
	return(bind_rows(acc,ks,smd))
	}
sumtxt/missEval documentation built on July 12, 2020, 12:07 a.m.