Nothing
#' Test of missing data mechanism using complete data
#'
#' This function tests the missing completely at random (MCAR) vs missing at random (MAR) by using the complete variables only.
#' @usage RBtest(data)
#' @param data Dataset with at least one complete variable. The variables could be either continuous, categorical or a mix of both.
#'
#' @return A list of the following elements:
#' \itemize{
#' \item \code{abs.nbrMD} The absolute number of missing data per variable.
#' \item \code{rel.nbrMD} The percentage of missing data per variable.
#' \item \code{type} Vector of the same length than the number of variables of the dataset, where '0' is for variables with MCAR data, '1' is for variables with MAR data and '-1' is for complete variables.
#' }
#'
#' @examples
#'
#' set.seed(60)
#' n<-100 # sample size
#' r<-5 # number of variables
#' mis<-0.2 # frequency of missing data
#' mydata<-matrix(NA, nrow=n, ncol=r) # mydata is a matrix of r variables
#' # following a U(0,1) distribution
#' for (i in c(1:r)){
#' mydata[,i]<-runif(n,0,1)
#' }
#' bin.var<-sample(LETTERS[1:2],n,replace=TRUE, prob=c(0.3,0.7)) # binary variable [A,B].
#' # The probability of being in one of the categories is 0.3.
#' cat.var<-sample(LETTERS[1:3],n,replace=TRUE, prob=c(0.5,0.3,0.2)) # categorical variable [A,B,C].
# The vector of probabilities of occurence A, B and C is (0.5,0.3,0.7).
#' num.var<-runif(n,0,1) # Additional continuous variable following a U(0,1) distribution
#' mydata<-cbind.data.frame(mydata,bin.var,cat.var,num.var,stringsAsFactors = TRUE)
#' # dataframe with r+3 variables
#' colnames(mydata)=c("v1","v2","X1","X2","X3","X4","X5", "X6") # names of columns
#' # MCAR on X1 and X4 by using v1 and v2. MAR on X3 and X5 by using X2 and X6.
#' mydata$X1[which(mydata$v1<=sort(mydata$v1)[mis*n])]<-NA # X1: (mis*n)% of MCAR data.
#' # All data above the (100-mis)th percentile in v1 are selected
#' # and the corresponding observations in X1 are replaced with missing data.
#' mydata$X3[which(mydata$X2<=sort(mydata$X2)[mis*n])]<-NA # X3: (mis*n)% of MAR data.
#' # All data above the (100-mis)th percentile in X2 are selected
#' # and the corresponding observations in X3 are replaced with missing data.
#' mydata$X4[which(mydata$v2<=sort(mydata$v2)[mis*n])]<-NA # X4: (mis*n)% of MCAR data.
#' # All data above the (100-mis)th percentile in v2 are selected
#' # and the corresponding observations in X4 are replaced with missing data.
#' mydata$X5[which(mydata$X6<=sort(mydata$X6)[mis*n])]<-NA # X5: (mis*n)% of MAR data.
#' # All data above the (100-mis)th percentile in X6 are selected
#' # and the corresponding observations in X5 are replaced with missing data.
#' mydata$v1=NULL
#' mydata$v2=NULL
#' RBtest(mydata)
#'
#' @importFrom stats complete.cases ks.test lm p.adjust predict
#' @import nnet psych
#' @export
RBtest<-function(data){
Type.1<-rep(NA,ncol(data))
for (h in which(complete.cases(t(data))==FALSE)){ #only test the missing data on variables which contains missing data (logic)
U=data.frame(data[,h],data[ ,colSums(is.na(data)) == 0],stringsAsFactors = TRUE) # we use for the first test only the complete variables
#names(U1)[1] <- names(D)[h]
new_A=data.frame(U[is.na(U[,1])==FALSE,],stringsAsFactors = TRUE) # "A" part (complete cases)
new_B=data.frame(U[is.na(U[,1])==TRUE,],stringsAsFactors = TRUE) # "B" part
names(new_A)[1]="DV"
# if the variable is binary (2 category) :
if (is.factor(new_A$DV)==TRUE && length(levels(new_A$DV))==2) {
reg_A=multinom(DV~.,trace=FALSE,data=new_A) #trace argument for silent computation.
names(new_A)[1]=names(data)[h]
u_obs_hat=reg_A$fitted.values
u_obs_hat=data.frame(u_obs_hat, (1-u_obs_hat),stringsAsFactors = TRUE)
u_mis_hat=predict(reg_A, new_B, "probs")
u_mis_hat=data.frame(u_mis_hat, (1-u_mis_hat),stringsAsFactors = TRUE)
#-- ks.test --#
## BH adjustement!
RES=p.adjust(c(ks.test(u_obs_hat[,1], u_mis_hat[,1])$p.value,
ks.test(u_obs_hat[,2], u_mis_hat[,2])$p.value),
method = "BH", n = 2)
# >0.05 --> accept MCAR
# And then assign the right value to Type.1
if(RES[1]>0.05 && RES[2]>0.05) {
Type.1[h]=0
} else {
Type.1[h]=1
}
names(U)[1] <- names(data)[h]
}
# if the variable is categorical with more than two categories :
names(new_A)[1]="DV"
if (is.factor(new_A$DV)==TRUE && length(levels(new_A$DV))>2) {
reg_A=multinom(DV~.,trace=FALSE,data=new_A) #trace argument for silent computation.
names(new_A)[1]=names(data)[h]
u_obs_hat=reg_A$fitted.values
u_mis_hat=predict(reg_A, new_B, "probs")
#-- ks.test --#
# First, compute the p-values of the ks.test for each category of the variable of interest
KolSmir<-rep(NA,ncol(u_obs_hat))
for (i in 1:ncol(u_obs_hat)){
KolSmir[i]<-ks.test(u_obs_hat[,i], u_mis_hat[,i])$p.value
}
# And then --> BH adjustement!
RES=p.adjust(KolSmir, method = "BH", n = ncol(u_obs_hat))
# >0.05 --> accept MCAR
# And then assign the right value to Type.1
if (any(RES<0.05)==FALSE){
Type.1[h]=0
} else {
Type.1[h]=1
}
names(U)[1] <- names(data)[h]
}
# if the variable is continuous :
names(new_A)[1]="DV"
if (is.factor(new_A$DV)==FALSE) {
reg_A=lm(DV~.,data=new_A)
names(new_A)[1]=names(data)[h]
u_obs_hat=predict(reg_A, new_A) # prediction of x1_obs
u_mis_hat=predict(reg_A, new_B) # prediction of x1_mis
#-- ks.test --#
RES=ks.test(u_obs_hat, u_mis_hat)$p.value
# >0.05 --> accept MCAR
if(RES>0.05) Type.1[h]=0
if(RES<=0.05) Type.1[h]=1
names(U)[1] <- names(data)[h]
}
}
Type.1[which(is.na(Type.1))]=-1
type<-Type.1
names(type)<-names(data)
abs_nbrMD<-colSums(is.na(data))
rel_nbrMD<-colSums(is.na(data))/(nrow(data))
return(list(abs.nbrMD=round(abs_nbrMD,0),rel.nbrMD=round(rel_nbrMD,2), type=type))
}
#' Test of missing data mechanism using all available information
#'
#' This function tests MCAR vs MAR by using both the complete and incomplete variables.
#' @usage RBtest.iter(data,K)
#' @param data Dataset with at least one complete variable.
#' @param K Maximum number of iterations.
#'
#' @return A list of the following elements:
#' \itemize{
#' \item \code{abs.nbrMD} The absolute quantity of missing data per variable.
#' \item \code{rel.nbrMD} The percentage of missing data per variable.
#' \item \code{K} The maximum admitted number of iterations.
#' \item \code{iter} The final number of iterations.
#' \item \code{type.final} Vector of the same length than the number of variables of the dataset, where '0' is for variables with MCAR data, '1' is for variables with MAR data and '-1' is for complete variables.
#' \item \code{TYPE.k} Dataframe containing the type of missing data after each iteration. Each row is a vector having the same length than the number of variables of the dataset, where '0' is for variables with MCAR data, '1' is for variables with MAR data and '-1' is for complete variables.
#' }
#'
#' @examples
#'
#' set.seed(60)
#' n<-100 # sample size
#' r<-5 # number of variables
#' mis<-0.2 # frequency of missing data
#' mydata<-matrix(NA, nrow=n, ncol=r) # mydata is a matrix of r variables
#' # following a U(0,1) distribution
#' for (i in c(1:r)){
#' mydata[,i]<-runif(n,0,1)
#' }
#' bin.var<-sample(LETTERS[1:2],n,replace=TRUE, prob=c(0.3,0.7)) # binary variable [A,B].
#' # The probability of being in one of the categories is 0.3.
#' cat.var<-sample(LETTERS[1:3],n,replace=TRUE, prob=c(0.5,0.3,0.2)) # categorical variable [A,B,C].
#' # The vector of probabilities of occurence A, B and C is (0.5,0.3,0.7).
#' num.var<-runif(n,0,1) # Additional continuous variable following a U(0,1) distribution
#' mydata<-cbind.data.frame(mydata,bin.var,cat.var,num.var,stringsAsFactors = TRUE)
#' # dataframe with r+3 variables
#' colnames(mydata)=c("v1","v2","X1","X2","X3","X4","X5", "X6") # names of columns
#' # MCAR on X1 and X4 by using v1 and v2. MAR on X3 and X5 by using X2 and X6.
#' mydata$X1[which(mydata$v1<=sort(mydata$v1)[mis*n])]<-NA # X1: (mis*n)% of MCAR data.
#' # All data above the (100-mis)th percentile in v1 are selected
#' # and the corresponding observations in X1 are replaced with missing data.
#' mydata$X3[which(mydata$X2<=sort(mydata$X2)[mis*n])]<-NA # X3: (mis*n)% of MAR data.
#' # All data above the (100-mis)th percentile in X2 are selected
#' # and the corresponding observations in X3 are replaced with missing data.
#' mydata$X4[which(mydata$v2<=sort(mydata$v2)[mis*n])]<-NA # X4: (mis*n)% of MCAR data.
#' # All data above the (100-mis)th percentile in v2 are selected
#' # and the corresponding observations in X4 are replaced with missing data.
#' mydata$X5[which(mydata$X6<=sort(mydata$X6)[mis*n])]<-NA # X5: (mis*n)% of MAR data.
#' # All data above the (100-mis)th percentile in X6 are selected
#' # and the corresponding observations in X5 are replaced with missing data.
#' mydata$v1=NULL
#' mydata$v2=NULL
#'
#' RBtest.iter(mydata,5)
#'
#' @importFrom stats complete.cases ks.test lm p.adjust predict
#' @import nnet mice psych
#' @export
RBtest.iter<-function(data,K){
#Type.1 <- TestMDM(data)
Type.1 <- RBtest(data)$type
D1<-data
TYPE_k=data.frame(matrix(NA, ncol=ncol(data)+1, nrow=K+1),stringsAsFactors = TRUE) #initialisation of the sequences of missing data mechanisms for each iteration
Type.2<-rep(NA,ncol(data))
k=1 # k is the number of iterations for the while loop. This is for excluding the case of infinite loop.
TYPE_k[1,]=c(k,Type.1)
Type.km1 <-rep(-2,(ncol(data)+1))
while(all(TYPE_k[k,2:ncol(TYPE_k)]==Type.km1[2:length(Type.km1)])==FALSE) {
for (h in which(complete.cases(t(data))==FALSE)){ #only test the missing data on variables which contains missing data (logic)
U=data.frame(data[,h],D1[,-h],stringsAsFactors = TRUE) # first on v_h from the original dataset
names(U)[1] <- names(data)[h]
new_A=data.frame(U[is.na(U[,1])==FALSE,],stringsAsFactors = TRUE) # "A" part
new_B=data.frame(U[is.na(U[,1])==TRUE,],stringsAsFactors = TRUE) # "B" part
names(new_A)[1]="DV"
# if the variable is binary (2 category) :
if (is.factor(new_A$DV)==TRUE && length(levels(new_A$DV))==2) {
reg_A=multinom(DV~.,trace=FALSE,data=new_A) #trace argument for silent computation.
names(new_A)[1]=names(data)[h]
u_obs_hat=reg_A$fitted.values
u_obs_hat=data.frame(u_obs_hat, (1-u_obs_hat),stringsAsFactors = TRUE)
u_mis_hat=predict(reg_A, new_B, "probs")
u_mis_hat=data.frame(u_mis_hat, (1-u_mis_hat),stringsAsFactors = TRUE)
#-- ks.test --#
## BH adjustement!
RES=p.adjust(c(ks.test(u_obs_hat[,1], u_mis_hat[,1])$p.value,
ks.test(u_obs_hat[,2], u_mis_hat[,2])$p.value),
method = "BH", n = 2)
# >0.05 --> accept MCAR
# And then assign the right value to Type.1
if(RES[1]>0.05 && RES[2]>0.05) {
Type.2[h]=0
} else {
Type.2[h]=1
}
names(U)[1] <- names(data)[h]
### Imputation of v_h with respect to the result!
if (RES[1]>0.05 && RES[2]>0.05) {
D1[is.na(data[,h]),h]<-sample(D1[!is.na(data[,h]),h],length(D1[is.na(data[,h]),h]),replace=TRUE)
} else {
U=complete(mice(U, m = 1, maxit=5, printFlag=FALSE))
#printFlag option for silent computation
D1[,h]=U[,1]
}
}
names(new_A)[1]="DV"
# if the variable is categorical with more than two categories :
if (is.factor(new_A$DV)==TRUE && length(levels(new_A$DV))>2) {
reg_A=multinom(DV~.,trace=FALSE,data=new_A) #trace argument for silent computation.
names(new_A)[1]=names(data)[h]
u_obs_hat=reg_A$fitted.values
u_mis_hat=predict(reg_A, new_B, "probs")
#-- ks.test --#
# First, compute the p-values of the ks.test for each category of the variable of interest
KolSmir<-rep(NA,ncol(u_obs_hat))
for (i in 1:ncol(u_obs_hat)){
KolSmir[i]<-ks.test(u_obs_hat[,i], u_mis_hat[,i])$p.value
}
# And then --> BH adjustement!
RES=p.adjust(KolSmir, method = "BH", n = ncol(u_obs_hat))
# >0.05 --> accept MCAR
if (any(RES<0.05)==FALSE){
Type.2[h]=0
} else {
Type.2[h]=1
names(U)[1] <- names(data)[h]
}
# Imputation of v_h with respect to the result!
if (any(RES<0.05)==FALSE) {
D1[is.na(data[,h]),h]<-sample(D1[!is.na(data[,h]),h],length(D1[is.na(data[,h]),h]),replace=TRUE)
} else {
U=complete(mice(U, m = 1, maxit=5, printFlag=FALSE))
#printFlag option for silent computation
D1[,h]=U[,1]
}
}
# if the variable is continuous :
names(new_A)[1]="DV"
if (is.factor(new_A$DV)==FALSE) {
reg_A=lm(DV~.,data=new_A)
names(new_A)[1]=names(data)[h]
u_obs_hat=predict(reg_A, new_A) # prediction of x1_obs
u_mis_hat=predict(reg_A, new_B) # prediction of x1_mis
#-- ks.test --#
RES=ks.test(u_obs_hat, u_mis_hat)$p.value
# >0.05 --> accept MCAR
if(RES>0.05) Type.2[h]=0
if(RES<=0.05) Type.2[h]=1
names(U)[1] <- names(data)[h]
### Imputation of v_h with respect to the result!
if (RES>0.05) {
D1[is.na(data[,h]),h]<-sample(D1[!is.na(data[,h]),h],length(D1[is.na(data[,h]),h]),replace=TRUE)
}
if(RES<=0.05) {
U=complete(mice(U, m = 1, printFlag=FALSE))
#printFlag option for silent computation
D1[,h]=U[,1]
}
}
}
Type.2[which(is.na(Type.2))]=-1
k=k+1
TYPE_k[k,]=c(k,Type.2)
Type.km1 <- TYPE_k[k-1,]
if (k==K+1) break
}
names(TYPE_k)[1]<-"k"
abs_nbrMD<-colSums(is.na(data))
rel_nbrMD<-colSums(is.na(data))/(nrow(data))
TYPE.k<-TYPE_k[which(complete.cases(TYPE_k)==TRUE),]
names(TYPE.k)<-c("k",names(data))
type.final<-TYPE_k[nrow(TYPE.k),2:ncol(TYPE.k)]
names(type.final)<-names(data)
iter<-nrow(TYPE.k)
return(list(abs.nbrMD=round(abs_nbrMD,0),rel.nbrMD=round(rel_nbrMD,2),K=K,iter=iter, type.final=type.final, TYPE.k=TYPE.k))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.