#'This function is used to impute new data using training imputer models
#' @param object training imputer object
#' @param newdata a data frame
#' @param pmm.new whether or not to apply predictive mean matching with the new dataset. Default: FALSE
#' @param pmm.k the number of donors for predictive mmean matching. Default: NULL
#' @param m the number of imputed datasets. Default: NULL
#' @export
impute.new<-function(object,newdata,pmm.new=FALSE,pmm.k=NULL,m=NULL){
save.vars=object$params$save.vars
#extract imputer models from the training object
xgb.fit=object$saved.models
#extract params from the training object
if(is.null(pmm.k)){
pmm.k=object$params$pmm.k
}else{
pmm.k=pmm.k
}
if(is.null(m)){
m=object$params$m
}else{
if(m <= object$params$m){
m=m
}else{
stop("The value of m in impute.new() cannot be larger than the value of m in $impute().")
}
}
Names=object$params$Names
type=object$params$type
#new data variables should be in the same order as the training dataset
sorted.idx=object$params$sorted.idx
initial.imp=object$params$initial.imp
pmm.type=object$params$pmm.type
pmm.link=object$params$pmm.link
#
p=ncol(newdata)
Nrow=nrow(newdata)
#sort the newdata according to the sorted order of the training dataset
sorted.df=newdata[Names]
initial.df=sorted.df
num.na=colSums(is.na(sorted.df))
#check new data and give some warning messages (unfinished)
if(all(num.na==0)){
stop("No missing values in new data.")
}
if(!is.null(save.vars)){
nacol.new=Names[which(num.na!=0)]
if(!all(nacol.new %in% save.vars)){
unsaved=nacol.new[which(!nacol.new %in% save.vars)]
msg1=paste("There exists at least one missing value in the following variable(s): ",paste(unsaved,collapse=";"),
".",sep="")
msg2=paste("However, your hadn't specified to save imputation models for these variables.")
msg3=paste("Please either add these variables in the argument save.vars or set save.vars=NULL and re-train the imputer.")
stop(paste(msg1,msg2,msg3,sep="\n"))
}
}
#check if there is only one observation in new data, cannot use pmm.new=TRUE, force to match with the training data instead.
if(Nrow == 1 & pmm.new == TRUE){
stop("There is only one observation in new data.\n No other observations in new data to match.\n Please set pmm.new = FALSE")
}
#check if the number of observations is less than pmm.k when pmm.new=TRUE
if(Nrow <= pmm.k & pmm.new ==TRUE){
stop("The number of observations in new data is less than the value of pmm.k. \n Please either set pmm.new = FALSE or set pmm.k = 1")
}
if(any(num.na == Nrow) & pmm.new == TRUE){
stop("There is at least one variable in new data with 100% missing values.\n No observed values in at least one variable in the new data to match.\n Please set pmm.new = FALSE")
}
if(any(Nrow-num.na < pmm.k) & pmm.new == TRUE){
maxNA=max(num.na)
minObs=Nrow-maxNA
s1=paste("In this dataset, the minimum number of observed values in a variable is ", minObs, ".",sep="")
s2=paste("However, pmm.k=",pmm.k,".",sep="")
if(minObs == 1){
s3=paste("Please either set pmm.new = FALSE or set pmm.k = 1 .")
}else{
s3=paste("Please either set pmm.new = FALSE or set the value of pmm.k less than or equal to ",minObs,".",sep="")
}
stop(paste(s1,s2,s3,sep="\n"))
}
na.newdata=list()
#initial imputation
if(pmm.new==FALSE){
#extract a set of observed training data (use for initial imputation and pmm)
train.df=object$imputed.data[[1]]
Obs.index=object$params$Obs.index
for(i in 1:p){
#location of the missing new data
na.index=which(is.na(sorted.df[,i]))
na.newdata[[i]]=na.index
#location of observed training data
obs.index=Obs.index[[i]]
if(initial.imp=="random"){
initial.df[na.index,i]<-sample(train.df[Names][,i][obs.index],num.na[i],replace=TRUE)
}else if(initial.imp=="rnorm"){
if(type[i]=="numeric"){
var.mean=mean(train.df[Names][,i],na.rm = T)
var.sd=sd(train.df[Names][,i],na.rm = T)
initial.df[na.index,i]<-rnorm(num.na[i],var.mean,var.sd)
}else{
initial.df[na.index,i]<-sample(train.df[Names][,i][obs.index],num.na[i],replace=TRUE)
}
}else{
#numeric: initial impute median
if(type[i]=="numeric"){
initial.df[na.index,i]<-median(train.df[Names][,i],na.rm = T)
}else{
#factor: initial impute major class
initial.df[na.index,i]<-names(which.max(table(train.df[Names][,i])))
}
}
}
}else{
#match to the new dataset
for(i in 1:p){
#location of the missing new data
na.index=which(is.na(sorted.df[,i]))
na.newdata[[i]]=na.index
if(initial.imp=="random"){
if(num.na[i] == Nrow-1){
initial.df[na.index,i]<-rep(sorted.df[,i][-na.index],num.na[i])
}else{
initial.df[na.index,i]<-sample(sorted.df[,i][-na.index],num.na[i],replace=TRUE)
}
}else if(initial.imp=="rnorm"){
if(type[i]=="numeric"){
var.mean=mean(sorted.df[,i],na.rm = T)
if(num.na[i] == Nrow-1){
var.sd=0
}else{
var.sd=sd(sorted.df[,i],na.rm = T)
}
initial.df[na.index,i]<-rnorm(num.na[i],var.mean,var.sd)
}else{
if(num.na[i] == Nrow-1){
initial.df[na.index,i]<-rep(sorted.df[,i][-na.index],num.na[i])
}else{
initial.df[na.index,i]<-sample(sorted.df[,i][-na.index],num.na[i],replace=TRUE)
}
}
}else{
#numeric: initial impute median
if(type[i]=="numeric"){
initial.df[na.index,i]<-median(sorted.df[,i],na.rm = T)
}else{
#factor: initial impute major class
initial.df[na.index,i]<-names(which.max(table(sorted.df[,i])))
}
}
}
}
#### m multiple imputation
if(is.null(pmm.type)){
###no pmm
imputed.data<-list()
for(k in 1:m){
copy=initial.df
for(i in 1:p){
na.index=na.newdata[[i]]
if(length(na.index)>0){
#only impute columns with missing values
if(p==2){
mis.data=sparse.model.matrix(as.formula(paste(Names[i],"~.",sep="")),data=copy[na.index,])
}else{
mis.data=sparse.model.matrix(as.formula(paste(Names[i],"~.",sep="")),data=copy[na.index,])[,-1]
}
if(length(na.index)==1){
mis.data=t(mis.data)
}
if(type[i]=="numeric"){
pred.y=predict(xgb.fit[[k]][[i]],mis.data)
#update dataset
copy[,i][na.index]<-pred.y
}else if(type[i]=="binary"){
if(length(xgb.fit[[k]][[i]])!=1){
xgb.pred = predict(xgb.fit[[k]][[i]],mis.data)
pred.y=ifelse(xgb.pred>=0.5,1,0)
pred.y=levels(train.df[Names][,i])[pred.y+1]
#update dataset
copy[,i][na.index]<-pred.y
}else{
#skip xgboost training, just impute majority class
copy[,i][na.index]<-xgb.fit[[k]][[i]]
}
}else{
xgb.pred = predict(xgb.fit[[k]][[i]],mis.data)
pred.y=levels(train.df[Names][,i])[xgb.pred+1]
#update dataset
copy[,i][na.index]<-pred.y
}
}
}#end of 1:p
imputed.data[[k]]<-copy[order(sorted.idx)]
}
return(imputed.data)
}else if(pmm.type==1){
####
#multiple imputation m=5
imputed.data<-list()
if(pmm.new==FALSE){
#extract predicted value of observed data in training set
yhatobs.list=object$params$yhatobs.list
#extract original observed data in training data
yobs.list=object$params$yobs.list
}else{
#match to new dataset
yhatobs.list<-list()
yobs.list<-list()
for(i in 1:p){
na.index=na.newdata[[i]]
if(length(na.index)>0){
obs.y=sorted.df[,i][-na.index]
if(type[i]!="numeric"){
obs.y=as.integer(obs.y)-1
}
yobs.list[[i]]=obs.y
if(p==2){
obs.data=sparse.model.matrix(as.formula(paste(Names[i],"~.",sep="")),data=initial.df[-na.index,])
}else{
obs.data=sparse.model.matrix(as.formula(paste(Names[i],"~.",sep="")),data=initial.df[-na.index,])[,-1]
}
if(length(obs.y)==1){
obs.data=t(obs.data)
warning("There is only one observed entry in an numeric variable in the new data.\n PMM match to new data may not be sensible. \n Recommend to set pmm.new = FALSE ")
}
if(type[i]=="numeric"){
yhatobs=predict(xgb.fit[,1][[1]][[i]],obs.data)
#update dataset
yhatobs.list[[i]]=yhatobs
}else if(type[i]=="binary"){
t=sort(table(obs.y))
#t[1] minority class t[2]majority class
if(!is.na(t[2]) & length(xgb.fit[,1][[1]][[i]])!=1){
#if the new data have more than one class and
#the training data have more than one class
xgb.pred = predict(xgb.fit[,1][[1]][[i]],obs.data)
yhatobs.list[[i]]=xgb.pred
}else{
#if the training data mainly have one class
warning("Majority class of a binary variable in the new dataset was used as the predicted value for predictive mean matching (type 1).\n Because this variable only contain one class in the new dataset.")
#skip getting predicted value using the saved xgboost model, just impute majority class
yhatobs.list[[i]]<-rep(t[1],length(obs.y))
}
}else{
xgb.pred = predict(xgb.fit[,1][[1]][[i]],obs.data,reshape = T)
if(pmm.link=="logit"){
xgb.pred<-log(xgb.pred/(1-xgb.pred))
}
yhatobs.list[[i]]=xgb.pred
}
}
}
}#end of match to new data for pmm type 1
for(k in 1:m){
copy=initial.df
for(i in 1:p){
########
###########
na.index=na.newdata[[i]]
if(length(na.index)>0){
if(p==2){
mis.data=sparse.model.matrix(as.formula(paste(Names[i],"~.",sep="")),data=initial.df[na.index,])
}else{
mis.data=sparse.model.matrix(as.formula(paste(Names[i],"~.",sep="")),data=initial.df[na.index,])[,-1]
}
if(length(na.index)==1){
mis.data=t(mis.data)
}
if(type[i]=="numeric"){
pred.y=predict(xgb.fit[,2][[k]][[i]],mis.data)
#update dataset
copy[,i][na.index]<- pmm(yhatobs = yhatobs.list[[i]],yhatmis = pred.y,yobs=yobs.list[[i]],k=pmm.k)
}else if(type[i]=="binary"){
if(length(xgb.fit[,2][[k]][[i]])!=1){
xgb.pred = predict(xgb.fit[,2][[k]][[i]],mis.data)
num.result=pmm(yhatobs = yhatobs.list[[i]],yhatmis = xgb.pred,yobs=yobs.list[[i]],k=pmm.k)
#change to factor
copy[,i][na.index]<- levels(train.df[Names][,i])[num.result+1]
}else{
#skip xgboost training, just impute majority class
copy[,i][na.index]<-xgb.fit[,2][[k]][[i]]
}
}else{
xgb.pred= predict(xgb.fit[,2][[k]][[i]],mis.data,reshape = T)
if(pmm.link=="logit"){
xgb.pred=log(xgb.pred/(1-xgb.pred))
}
num.result=pmm.multiclass(donor.pred = yhatobs.list[[i]],target.pred = xgb.pred,donor.obs = yobs.list[[i]],k=pmm.k)
#change to factor
copy[,i][na.index]<- levels(train.df[Names][,i])[num.result+1]
}
}
}
imputed.data[[k]]<-copy[order(sorted.idx)]
}#end m multiple imputation
return(imputed.data)
}else if(pmm.type==2){
###########
imputed.data<-list()
if(pmm.new==FALSE){
#extract predicted value of observed data in training set
yhatobs.list=object$params$yhatobs.list
#extract original observed data in training data
yobs.list=object$params$yobs.list
}else{
#match to new dataset
yhatobs.list<-replicate(m,list())
yobs.list<-list()
#store observed values in the new dataset
for(i in 1:p){
na.index=na.newdata[[i]]
if(length(na.index)>0){
obs.y=sorted.df[,i][-na.index]
if(type[i]!="numeric"){
obs.y=as.integer(obs.y)-1
}
yobs.list[[i]]=obs.y
}
}
#store predicted values of observed entries from new data
for(k in 1:m){
for(i in 1:p){
na.index=na.newdata[[i]]
if(length(na.index)>0){
if(p==2){
obs.data=sparse.model.matrix(as.formula(paste(Names[i],"~.",sep="")),data=initial.df[-na.index,])
}else{
obs.data=sparse.model.matrix(as.formula(paste(Names[i],"~.",sep="")),data=initial.df[-na.index,])[,-1]
}
if(length(yobs.list[[i]])==1){
obs.data=t(obs.data)
warning("There is only one observed entry in an numeric variable in the new data.\n PMM match to new data may not be sensible. \n Recommend to set pmm.new = FALSE ")
}
if(type[i]=="numeric"){
yhatobs=predict(xgb.fit[[k]][[i]],obs.data)
#update dataset
yhatobs.list[[k]][[i]]=yhatobs
}else if(type[i]=="binary"){
t=sort(table(yobs.list[[i]]))
#t[1] minority class t[2]majority class
if(!is.na(t[2]) & length(xgb.fit[[k]][[i]])!=1){
#if the new data have more than one class and
#the training data have more than one class
xgb.pred = predict(xgb.fit[[k]][[i]],obs.data)
yhatobs.list[[k]][[i]]=xgb.pred
}else{
#if the training data mainly have one class
warning("Majority class of a binary variable in the new dataset was used as the predicted value
for predictive mean matching (type 2) because the variable in the training dataset or new dataset mainly contain one class")
#skip getting predicted value using the saved xgboost model, just impute majority class
yhatobs.list[[k]][[i]]<-rep(t[1],length(obs.y))
}
}else{
xgb.pred = predict(xgb.fit[[k]][[i]],obs.data,reshape = T)
if(pmm.link=="logit"){
xgb.pred<-log(xgb.pred/(1-xgb.pred))
}
yhatobs.list[[k]][[i]]=xgb.pred
}
}
}
}
}
for(k in 1:m){
copy=initial.df
for(i in 1:p){
na.index=na.newdata[[i]]
if(length(na.index)>0){
if(p==2){
mis.data=sparse.model.matrix(as.formula(paste(Names[i],"~.",sep="")),data=copy[na.index,])
}else{
mis.data=sparse.model.matrix(as.formula(paste(Names[i],"~.",sep="")),data=copy[na.index,])[,-1]
}
if(length(na.index)==1){
mis.data=t(mis.data)
}
if(type[i]=="numeric"){
pred.y=predict(xgb.fit[[k]][[i]],mis.data)
###use boostrap observed data to fit model
#use this model to predict all missing values in new data and match with all observed values in new data
#update dataset
copy[,i][na.index]<-pmm(yhatobs = yhatobs.list[[k]][[i]],yhatmis = pred.y,yobs=yobs.list[[i]],k=pmm.k)
}else if(type[i]=="binary"){
if(length(xgb.fit[[k]][[i]])!=1){
xgb.pred = predict(xgb.fit[[k]][[i]],mis.data)
num.result=pmm(yhatobs = yhatobs.list[[k]][[i]],yhatmis = xgb.pred,yobs=yobs.list[[i]],k=pmm.k)
#change to factor
copy[,i][na.index]<- levels(train.df[Names][,i])[num.result+1]
}else{
#skip xgboost training, just impute majority class
copy[,i][na.index]<-xgb.fit[[k]][[i]]
}
}else{
xgb.pred = predict(xgb.fit[[k]][[i]],mis.data,reshape = T)
if(pmm.link=="logit"){
xgb.pred=log(xgb.pred/(1-xgb.pred))
}
num.result=pmm.multiclass(donor.pred = yhatobs.list[[k]][[i]],target.pred = xgb.pred,donor.obs = yobs.list[[i]],k=pmm.k)
#change to factor
copy[,i][na.index]<- levels(train.df[Names][,i])[num.result+1]
}
}
}
imputed.data[[k]]<-copy[order(sorted.idx)]
}#end of k loops
return(imputed.data)
}else if(pmm.type=="auto"){
imputed.data<-list()
if(pmm.new==FALSE){
#extract predicted value of observed data in training set
yhatobs.list=object$params$yhatobs.list
#extract original observed data in training data
yobs.list=object$params$yobs.list
}else{
#match to new dataset
yhatobs.list<-replicate(m,list())
yobs.list<-list()
for(i in 1:p){
na.index=na.newdata[[i]]
if(length(na.index)>0){
obs.y=sorted.df[,i][-na.index]
if(type[i]!="numeric"){
obs.y=as.integer(obs.y)-1
}
yobs.list[[i]]=obs.y
}
}
for(k in 1:m){
for(i in 1:p){
na.index=na.newdata[[i]]
if(length(na.index)>0){
if(p==2){
obs.data=sparse.model.matrix(as.formula(paste(Names[i],"~.",sep="")),data=initial.df[-na.index,])
}else{
obs.data=sparse.model.matrix(as.formula(paste(Names[i],"~.",sep="")),data=initial.df[-na.index,])[,-1]
}
if(length(yobs.list[[i]])==1){
obs.data=t(obs.data)
}
if(type[i]=="numeric"){
yhatobs=predict(xgb.fit[[k]][[i]],obs.data)
if(length(yhatobs)==1){
warning("There is only one observed entry in an numeric variable in the new data.\n PMM match to new data may not be sensible. \n Recommend to set pmm.new = FALSE .")
}
#update dataset
yhatobs.list[[k]][[i]]=yhatobs
}#for factor variables don't need to do PMM
}
}
}#end of k loops
}#end of when pmm.new=TRUE, obtain yobs.list and yhatobs.list
#impute
for(k in 1:m){
copy=initial.df
for(i in 1:p){
na.index=na.newdata[[i]]
if(length(na.index)>0){
if(p==2){
mis.data=sparse.model.matrix(as.formula(paste(Names[i],"~.",sep="")),data=copy[na.index,])
}else{
mis.data=sparse.model.matrix(as.formula(paste(Names[i],"~.",sep="")),data=copy[na.index,])[,-1]
}
if(length(na.index)==1){
mis.data=t(mis.data)
}
if(type[i]=="numeric"){
#pmm type 2 for continuous variables
pred.y=predict(xgb.fit[[k]][[i]],mis.data)
#update dataset
copy[,i][na.index]<-pmm(yhatobs = yhatobs.list[[k]][[i]],yhatmis = pred.y,yobs=yobs.list[[i]],k=pmm.k)
}else if(type[i]=="binary"){
if(length(xgb.fit[[k]][[i]])!=1){
xgb.pred = predict(xgb.fit[[k]][[i]],mis.data)
pred.y=ifelse(xgb.pred>=0.5,1,0)
#update dataset
copy[,i][na.index]<-levels(train.df[Names][,i])[pred.y+1]
}else{
#skip xgboost training, just impute majority class
copy[,i][na.index]<-xgb.fit[[k]][[i]]
}
}else{
xgb.pred = predict(xgb.fit[[k]][[i]],mis.data)
#update dataset
copy[,i][na.index]<-levels(train.df[Names][,i])[xgb.pred+1]
}
}
}#end of p loops
imputed.data[[k]]<-copy[order(sorted.idx)]
}#end of k loops
return(imputed.data)
}#end of pmm.type="auto"
}#end of impute.new function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.