#' PKLMtest: compute a p-value for testing MCAR
#'
#' @param X a numeric matrix containing missing values encoded as NA, the data.
#' @param num.proj a positive integer specifying the number of projections to consider for the score.
#' @param num.trees.per.proj a positive integer, the number of trees per projection.
#' @param nrep a positive integer, the number of permutations.
#' @param min.node.size a positive number, the minimum number of nodes in a tree.
#' @param size.resp.set an integer (>= 2), maximum number of classes allowed to be compared in each projection.
#' @param compute.partial.pvals a boolean, indicate if partial p-values shopuld be computed as well.
#' @param ... additional parameters.
#' @import ranger
#' @import parallel
#'
#' @examples
#' n <- 100
#' X <- cbind(rnorm(n),rnorm(n))
#' X.NA <- X
#' X.NA[,1] <- ifelse(stats::runif(n)<=0.2, NA, X[,1])
#'
#' pval <- PKLMtest(X.NA, num.proj = 5)
#'
#' @return a numeric value, the p-value(s) for the MCAR test, the first value is always the global p-value and if compute.partial.pvals is set to TRUE, the next values are the partial p-values for the relative importance of each variable.
#'
#' @export
PKLMtest <- function(X,
num.proj = 300,
num.trees.per.proj = 10,
nrep = 500,
min.node.size = 10,
size.resp.set = 2,
compute.partial.pvals = FALSE,
...) {
# checks
if (num.proj < 2) {
stop("num-proj should be at least 2.")
}
if (size.resp.set < 2) {
stop("size.resp.set should be at least 2.")
}
# simplification
#size.resp.set <- size.resp.set - 1
# checks
if (!is.matrix(X)) {
stop("X should be a matrix.")
}
# get the mask for the data provided in X
M <- is.na(X)
# copute the permutations of the missing value patterns
Mperm <- lapply(1:nrep, function(j) M[sample(nrow(M)),])
# list of results
list.obs <- list()
list.null <- list()
list.var.cov <- list()
list.var.resp <- list()
# main loop
while (length(list.obs) != num.proj) {
# sample a projection for the response
new.var <- c()
num.classes <- 1
while (length(new.var)==0 || num.classes <= 1 ) {
# sample a projection for the covariates
size.cov <- sample(1:ncol(X), size = 1, replace = FALSE)
var.cov <- sample(1:ncol(X), size = size.cov, replace = FALSE)
new.var <- setdiff(1:ncol(X), var.cov)
if (length(new.var)>1) {
if (size.resp.set=="random"){
size.resp <- sample(1:length(new.var), size = 1, replace = FALSE)
}else{
size.resp<- sample(1:min(length(new.var), size.resp.set), size=1, replace=FALSE)
}
var.resp <- sample(new.var, size = size.resp, replace = FALSE)
} else if (length(new.var)==1) {
var.resp <- new.var
} else if (length(new.var)==0){
var.resp <- var.cov
}
# which one are complete there
ids.keep <- which(apply(X[,var.cov,drop=FALSE],
1, function(x) !any(is.na(x))))
# define the new response
M.resp <- M[ids.keep,var.resp,drop=F]
unique.pat.resp <- unique(M.resp)
num.classes<- nrow(unique(M.resp))
}
list.var.cov[[length(list.var.cov)+1]] <- var.cov
list.var.resp[[length(list.var.resp)+1]] <- var.resp
ids.patterns.resp <- sapply(1:nrow(unique.pat.resp),
function(i) which(apply(M.resp,
1,
function(m) identical(m,
unique.pat.resp[i,]))), simplify = F)
# get the data for the projection
X.proj <- X[ids.keep,var.cov,drop=F]
# create the class for the projection
class.resp <- rep(NA, length(ids.keep))
for (i in 1:length(ids.patterns.resp)) {
class.resp[ids.patterns.resp[[i]]] <- i
}
class.resp.perm<-lapply(1:nrep, FUN=function(j){
M.respj <- Mperm[[j]][ids.keep,var.resp,drop=F]
unique.pat.respj <- unique(M.respj)
num.classesj<- nrow(unique(M.respj))
ids.patterns.resp <- sapply(1:nrow(unique.pat.respj),
function(i) which(apply(M.respj,
1,
function(m) identical(m,
unique.pat.respj[i,]))), simplify = F)
# create the class for the projection
class.respj <- rep(NA, length(ids.keep))
for (i in 1:length(ids.patterns.resp)) {
class.respj[ids.patterns.resp[[i]]] <- i
}
return(class.respj)
} )
d <- data.frame(y = factor(class.resp),
X = X.proj)
st <- tryCatch({ranger::ranger(y~., data = d,
num.trees = num.trees.per.proj,
classification = TRUE,
min.node.size = min.node.size,
probability = TRUE)
}, error = function(e) NA)
if(!any(is.na(st))) {
# generate the observed stat
obs <- genU(st, lab = class.resp)
# generate the stat under H0
null <- sapply(class.resp.perm, function(p) genU(st, lab = p))
list.obs[[length(list.obs)+1]] <- obs
list.null[[length(list.null)+1]] <- null
}else{
list.var.cov <- list.var.cov[-length(list.var.cov)]
list.var.resp <- list.var.cov[-length(list.var.resp)]
}
}
print("while done")
stat.perm.raw <- Reduce(list.null,f = rbind)
isIn <- sapply(1:ncol(X), function(i) sapply(list.var.resp, function(l) !(i %in% l))) # list.var.resp
isIn <- cbind(rep(TRUE, num.proj ), isIn)
if (!compute.partial.pvals) {
isIn <- isIn[,1,drop=FALSE]
}
partial.perm <- apply(isIn ,2, function(x) {
ids <- which(x)
colMeans(stat.perm.raw[ids,,drop=FALSE], na.rm=T)
})
if (ncol(isIn)==1) {
partial.perm <- matrix(partial.perm, nrow=1)
}
partial.obs <- apply(isIn ,2, function(x) {
if (sum(x)==0) {
return(NA)
} else {
ids <- which(x)
return(mean(unlist(list.obs[ids]),
na.rm=T))
}
})
##### partial p-values #########
if (compute.partial.pvals){
pvals <- sapply(1:(ncol(X)+1), function(i) (sum(partial.perm[,i] >= partial.obs[i])+1)/(length(partial.perm[,i])+1))
names(pvals) <- c("all", paste0("-",1:ncol(X)))
}else{
pvals<-(sum(partial.perm >= partial.obs)+1)/(length(partial.perm)+1)
names(pvals) <- "all"
}
##############################
return(pvals)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.