Nothing
####Classical hotdeck methods
#Author: Alexander Kowarik, Statistics Austria
## Sequential hot(cold)deck
## Random (within domain) hot(cold)deck
## Cold deck is not implemented yet
#data - data.frame of the data with missing
#variable - vector of variablesnames to be imputed
#ord_var - list of vectors of variablesnames to be used to order the dataset
#domain_var - vector of variablesnames to be used as domains
#makeNA - vector of values which should be imputed too e.g. 8,9 or 98,99 in SPSS-data sets
#NAcond - list of conditions for each variable to create NAs there (not yet implemented)
#donorcond - list of conditions for a donor e.g. "<=10000"
#TODO: Donors from cold deck
#' Hot-Deck Imputation
#'
#' Implementation of the popular Sequential, Random (within a domain) hot-deck
#' algorithm for imputation.
#'
#'
#' @param data data.frame or matrix
#' @param variable variables where missing values should be imputed (not overlapping with ord_var)
#' @param ord_var variables for sorting the data set before imputation (not overlapping with variable)
#' @param domain_var variables for building domains and impute within these
#' domains
#' @param makeNA list of length equal to the number of variables, with values, that should be converted to NA for each variable
#' @param NAcond list of length equal to the number of variables, with a condition for imputing a NA
#' @param impNA TRUE/FALSE whether NA should be imputed
#' @param donorcond list of length equal to the number of variables, with a donorcond condition as character string.
#' e.g. ">5" or c(">5","<10). If the list element for a variable is NULL no condition will be applied for this variable.
#' @param imp_var TRUE/FALSE if a TRUE/FALSE variables for each imputed
#' variable should be created show the imputation status
#' @param imp_suffix suffix for the TRUE/FALSE variables showing the imputation
#' status
#' @return the imputed data set.
#' @author Alexander Kowarik
#' @note If the sequential hotdeck does not lead to a suitable,
#' a random donor in the group will be used.
#' @references A. Kowarik, M. Templ (2016) Imputation with
#' R package VIM. *Journal of
#' Statistical Software*, 74(7), 1-16.
#' @keywords manip
#' @family imputation methods
#' @examples
#'
#' data(sleep)
#' sleepI <- hotdeck(sleep)
#' sleepI2 <- hotdeck(sleep,ord_var="BodyWgt",domain_var="Pred")
#'
#' # Usage of donorcond in a simple example
#' sleepI3 <- hotdeck(
#' sleep,
#' variable = c("NonD", "Dream", "Sleep", "Span", "Gest"),
#' ord_var = "BodyWgt", domain_var = "Pred",
#' donorcond = list(">4", "<17", ">1.5", "%between%c(8,13)", ">5")
#' )
#'
#' set.seed(132)
#' nRows <- 1e3
#' # Generate a data set with nRows rows and several variables
#' x <- data.frame(
#' x = rnorm(nRows), y = rnorm(nRows),
#' z = sample(LETTERS, nRows, replace = TRUE),
#' d1 = sample(LETTERS[1:3], nRows, replace = TRUE),
#' d2 = sample(LETTERS[1:2], nRows, replace = TRUE),
#' o1 = rnorm(nRows), o2 = rnorm(nRows), o3 = rnorm(100)
#' )
#' origX <- x
#' x[sample(1:nRows,nRows/10), 1] <- NA
#' x[sample(1:nRows,nRows/10), 2] <- NA
#' x[sample(1:nRows,nRows/10), 3] <- NA
#' x[sample(1:nRows,nRows/10), 4] <- NA
#' xImp <- hotdeck(x,ord_var = c("o1", "o2", "o3"), domain_var = "d2")
#'
#'
#' @export
hotdeck <- function(data , variable=NULL, ord_var=NULL,domain_var=NULL,
makeNA=NULL,NAcond=NULL,impNA=TRUE,donorcond=NULL,
imp_var=TRUE,imp_suffix="imp"
){
check_data(data)
if(!is.null(variable)&&!is.null(ord_var)){
if(length(intersect(ord_var,variable))>0){
stop(paste0(intersect(ord_var,variable),collapse=", "),
" should not be in the parameters ord_var and variable.
Since this can lead to unforeseen results and errors.")
}
}
OriginalSortingVariable <- impvar <- NULL #empty init
if(is.null(variable)){
variable <- colnames(data)
variable<-variable[!variable%in%c(ord_var,domain_var)]
}
if(!is.null(makeNA)){
if(!is.list(makeNA)||!length(makeNA)==length(variable))
stop("makeNA is not defined correctly. \n It should be a list of length equal to the length of the argument 'variable'.")
}
classx <- class(data)
VariableSorting <- colnames(data)
data$OriginalSortingVariable <- 1:nrow(data)
data <- data.table(data)
if(is.null(variable)){
variable <- colnames(data)[apply(is.na(data),2,any)]
}
if(!is.null(NAcond))
warning("NAcond is not implemented yet and will be ignored.")
classWithoutLabelled <- function(x){
cl <- class(x)
return(cl[cl!="labelled"])
}
varType <- sapply(data,classWithoutLabelled)[variable]
if(imp_var){
for(v in variable){
data[,impvar:=FALSE]
impvarname <- paste(v,"_",imp_suffix,sep="")
setnames(data,"impvar",impvarname)
VariableSorting <- c(VariableSorting,impvarname)
}
}
# If no ord_var is defined, a random ordered will be used
if(is.null(ord_var)){
RandomVariableForImputationWithHotdeck <- NULL # Init for CRAN check
nrowXforRunif <- nrow(data)
data[,RandomVariableForImputationWithHotdeck:=runif(nrowXforRunif)]
ord_var <- "RandomVariableForImputationWithHotdeck"
}
setkeyv(data,ord_var)
# if no domain_var is defined, the imputeHD function is automatically called on the
# whole data set
data <- data[,imputeHD(.SD,variableX=variable,varTypeX=varType,
imp_varX=imp_var,imp_suffixX=imp_suffix,impNAX=impNA,makeNAX=makeNA,
ord_varX = ord_var, donorcond = donorcond), by = domain_var]
if(any(ord_var=="RandomVariableForImputationWithHotdeck")){
data[,RandomVariableForImputationWithHotdeck:=NULL]
ord_var <- NULL
}
setkey(data,OriginalSortingVariable)
data[,OriginalSortingVariable:=NULL]
if(all(classx!="data.table"))
return(as.data.frame(data)[,VariableSorting,drop=FALSE])
return(data[,VariableSorting,with=FALSE])
}
## xx should be a data.table and ord_var the name of variables to sort
imputeHD <- function(xx,variableX,varTypeX,imp_varX,imp_suffixX,
impNAX,makeNAX, ord_varX, donorcond){
donor_applicable <- OriginalSortingVariable <- weirdandlongname <- UniqueIdForImputation <- NULL#empty init
J <- function()NULL#empty init
xx$UniqueIdForImputation <- 1:nrow(xx)
prevKey <- key(xx)
for(v in variableX){
xx[, donor_applicable := !is.na(xx[[v]])]
if (!is.null(donorcond)) {
if(!is.null(donorcond[[match(v,variableX)]])){
condition_string <- paste0("xx[[v]]", donorcond[[match(v,variableX)]],collapse="&")
TF <- eval(parse(text=condition_string))
xx[, donor_applicable := donor_applicable & TF]
}
}
if(!impNAX){
setkeyv(xx,v)
if(is.null(makeNAX))
stop("If impNA=FALSE a list of values to be imputed must be provided.")
## NAs should not be imputed
if(varTypeX[v]%in%c("numeric","integer")){
NAs <- xx[J(NA_real_),.I,nomatch=FALSE]# get the Index of the NAS
}else{
NAs <- xx[J(NA_character_),.I,nomatch=FALSE]# get the Index of the NAS
}
#NAs hold the index of observations with NAs in the current variable
if(length(NAs)>0){
xxna <- xx[NAs] # move observation to a temp data set
xx <- xx[-NAs] # just keep the non NA obs
}else{#if no NA xx is unchanged and xxna is just an empty data.table
xxna <- data.table()
}
setkeyv(xx,prevKey)
xx$UniqueIdForImputation <- 1:nrow(xx)
}
if(!is.null(makeNAX)){
# eval(parse(text="xx[xx>1]"))
setnames(xx,v,"weirdandlongname")
xx[weirdandlongname%in%makeNAX[[match(v,variableX)]],weirdandlongname:=NA]
setnames(xx,"weirdandlongname",v)
}
setkeyv(xx,v)
if(varTypeX[v]%in%c("numeric","integer")){
impPart <- xx[J(NA_real_),UniqueIdForImputation,nomatch=FALSE]#$UniqueIdForImputation
}else{
impPart <- xx[J(NA_character_),UniqueIdForImputation,nomatch=FALSE]#$UniqueIdForImputation
}
if((length(impPart)>0)&&(length(impPart)<nrow(xx))){
if(imp_varX){
impvarname <- paste(v,"_",imp_suffixX,sep="")
xx[UniqueIdForImputation%in%impPart,c(impvarname):=TRUE]
}
impDon <- impPart-1
impDon[impDon<1] <- impPart[impDon<1]+1
setkey(xx,UniqueIdForImputation)
Don <- data.frame(xx[impDon,v,with=FALSE])[,1]
TFindex <- xx[impDon, !donor_applicable]
TF <- any(TFindex)
if(TF){
add <- 2
while(TF){
impDon[TFindex] <- impPart[TFindex]-add
if(any(impDon[TFindex][impDon[TFindex]<1]<=-nrow(xx))){
impDon[TFindex][impDon[TFindex]<1][impDon[TFindex][impDon[TFindex]<1]<=-nrow(xx)] <-
- impDon[TFindex][impDon[TFindex]<1][impDon[TFindex][impDon[TFindex]<1]<=-nrow(xx)]
}
impDon[TFindex][impDon[TFindex]<1] <- impPart[TFindex][impDon[TFindex]<1]-add+nrow(xx)
impDon2 <- impDon[TFindex]
Don[TFindex] <- data.frame(xx[impDon2,v,with=FALSE])[,1]
TFindex[TFindex] <- xx[impDon2, !donor_applicable]
TF <- any(TFindex)
if(add>min(50, nrow(xx))){
TF <- FALSE
# remaining missing values will be set to a random value from the group
if(length(Don)>0){
Don[TFindex] <- Don[!TFindex][sample(sum(!TFindex),1)]
}
if(!identical(ord_varX, "RandomVariableForImputationWithHotdeck")){
warning(paste("For variable",v,"the ordering is ignored for at least one imputation."))
}
}
add <- add +1
}
}
xx[impPart,v] <- Don
}
if(!impNAX)
xx <- rbindlist(list(xx,xxna))
}
xx[,UniqueIdForImputation:=NULL]
setkey(xx,OriginalSortingVariable)
return(xx)
}
#require(data.table)
#setwd("/Users/alex")
#Rprof("profile1.out")
###TEST
#set.seed(132)
#nRows <- 1e6
#x<-data.frame(x=rnorm(nRows),y=rnorm(nRows),z=sample(LETTERS,nRows,rep=T),
# d1=sample(LETTERS[1:3],nRows,rep=T),d2=sample(LETTERS[1:2],nRows,rep=T),o1=rnorm(nRows),o2=rnorm(nRows),o3=rnorm(100))
#origX <- x
#x[sample(1:nRows,nRows/10),1] <- NA
#x[sample(1:nRows,nRows/10),2] <- NA
#x[sample(1:nRows,nRows/10),3] <- NA
#x[sample(1:nRows,nRows/10),4] <- NA
##
#xImp <- hotdeck_work2(x,ord_var = c("o1","o2","o3"),domain_var="d2")
#Rprof(NULL)
#summaryRprof("profile1.out")
#xImp1 <- hotdeck(x,ord_var = c("o1","o2","o3"),domain_var="d2")
#identical(xImp,xImp1)
#
#
#for(v in colnames(xImp)){
# print(v)
# print(identical(xImp[,v],xImp1[,v]))
#}
#
#
#
#require(microbenchmark)
#res <- microbenchmark(xImp <- hd(x,ord_var = c("o1","o2","o3"),domain_var="d2"),times=10)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.