#' Main function to calculate stability coefficients
#'
#' @param formula a formula, weight a response to left of ~.
#' @param data Data frame to run models on
#' @param methods Which tree methods to use. Defaults:
#' lm, rpart, tree, ctree, evtree. Also can use "rf" for random forests
#' @param samp.method Sampling method. Refer to caret package trainControl()
#' documentation. Default is repeated cross-validation. Other options
#' include "cv" and "boot".
#' @param tuneLength Number of tuning parameters to try. Applies to train()
#' @param n.rep Number of times to replicate each method
#' @param bump.rep Number of repetitions for bumping
#' @param parallel Whether to run all reps in parallel
#' @param ncore Number of cores to use
#' @param roundVal How much to round cut points when calculating stability
#' @param stablelearner Whether or not to use the stablelearner package to
#' calculate stability
#' @param subset Whether to subset
#' @param perc.sub What fraction of data to put into train dataset. 1-frac.sub
#' is allocated to test dataset. Defaults to 0.75
#' @param weights Optional weights for each case.
#' @importFrom parallel detectCores makeCluster clusterExport parLapply stopCluster
#' @importFrom prodlim row.match
#' @export
#' @examples
#' \dontrun{
#' library(MASS) # for boston data
#' data(Boston)
#' stab.out <- stable(formula=medv ~.,data=Boston,
#' methods=c("rpart"),samp.method="cv",
#' tuneLength=2, n.rep=5, parallel=TRUE)
#' stab.out
#' }
stable = function(formula,
data,
methods=c("lm","rpart","tree","ctree","evtree"),
samp.method="repeatedcv",
tuneLength=3,
n.rep=100,
bump.rep=50,
parallel=FALSE,
ncore=detectCores() - 1,
roundVal = 1,
stablelearner=FALSE,
subset=FALSE,
perc.sub=.75,
weights=NULL){
res <- list()
out <- list()
out2 <- list()
firSplit <- list()
orig.tree <- try(dtree(formula,data,methods,samp.method,
tuneLength,bump.rep,subset,perc.sub,weights,verbose=FALSE))
if(stablelearner==FALSE){
if(parallel==FALSE){
for(i in 1:n.rep){
set.seed(i)
print(i)
ids <- sample(nrow(data),nrow(data),replace=TRUE)
tt <- try(dtree(formula,data[ids,],methods,samp.method,
tuneLength,bump.rep,subset,perc.sub,weights,verbose=FALSE))
if(inherits(tt, "try-error")){
out[[i]] <- NULL
out2[[i]] <- NULL
firSplit[[i]] <- NULL
}else{
# print(tt)
out[[i]] <- tt
# print(out[[i]]$return.matrix)
out2[[i]] <- out[[i]]$return.matrix
firSplit[[i]] <- out[[i]]$firstSplit
}
}
out <- out[lapply(out,is.null)==FALSE]
out2 <- out2[lapply(out2,is.null)==FALSE]
firSplit <- firSplit[lapply(firSplit,is.null)==FALSE]
ret <- array(NA, dim=c(n.rep,length(methods),7))
for(j in 1:length(out)){
ret[j,,] <- out2[[j]]
}
}else{
data.rep <- list()
for(i in 1:n.rep){
set.seed(i)
ids <- sample(nrow(data),nrow(data),replace=TRUE)
data.rep[[i]] <- data[ids,]
}
#library(parallel)
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores)
e <- new.env()
clusterExport(cl, c("formula","methods","samp.method","tuneLength","bump.rep","subset","perc.sub","weights"),envir = e)
par.fun <- function(data){
library(dtree)
tt <- try(dtree(formula,data,methods,samp.method,
tuneLength,bump.rep,subset,perc.sub,weights))
if(inherits(tt, "try-error")){
tt <- NULL
tt
}else{
tt
}
}
out <- list()
out <- parLapply(cl, data.rep,par.fun)
stopCluster(cl)
out <- out[lapply(out,is.null)==FALSE]
out2 <- list()
ret <- array(NA, dim=c(length(out),length(methods),7))
firSplit <- list()
for(i in 1:length(out)){
out2[[i]] <- out[[i]]$return.matrix
firSplit[[i]] <- out[[i]]$firstSplit
ret[i,,] <- out2[[i]]
}
}
ret.mean <- apply(ret,3,colMeans,na.rm=TRUE)
ret.var <- apply(ret,3,matrixStats::colVars,na.rm=TRUE)
ret.mean <- matrix(ret.mean,length(methods),7)
ret.var <- matrix(ret.var,length(methods),7)
stability <- matrix(NA,1,length(methods))
colnames(stability) <- methods
similarity <- matrix(NA,1,length(methods))
colnames(similarity) <- methods
row.names(ret.mean) <- methods
row.names(ret.var) <- methods
colnames(ret.mean) <- colnames(out2[[i]])
colnames(ret.var) <- colnames(out2[[i]])
tt = terms(x=formula,data=data)
preds <- attr(tt,"term.labels")
counts.mean <- matrix(NA,length(methods),length(preds))
counts.var <- matrix(NA,length(methods),length(preds))
rownames(counts.mean) <- rownames(counts.var) <- methods
colnames(counts.mean) <- colnames(counts.var) <- preds
if(any(methods==c("ctree"))){
var.count <- matrix(NA,length(out),length(preds))
colnames(var.count) <- preds
where.ctree <- list()
for(i in 1:length(out)){
hh <- out[[i]]$ctree.splits
hh[,2] <- round(hh[,2],roundVal)
tab <- table(hh[,1])
where.ctree[[i]] <- hh
for(j in 1:length(preds)){
var.count[i,preds[j]] <- tab[preds[j]]
if(is.na(var.count[i,preds[j]]==TRUE)) var.count[i,preds[j]] <- 0
}
}
#sim.mat = data.frame()#matrix(NA,n.rep,length(out[[1]]$ctree.out@get_where()))
# for(i in 1:n.rep){
# if(i ==1){
# sim.mat = out[[i]]$ctree.out@get_where()
# }else{
## sim.mat = data.frame(sim.mat,out[[i]]$ctree.out@get_where())
# }
#sim.mat[i,] = out[[i]]$ctree.out@get_where()
# }
clus.mat = rep(NA,n.rep)
for(i in 1:(n.rep)){
set.seed(i)
ids <- sample(nrow(data),nrow(data),replace=TRUE)
uni = unique(ids)
boot = data[ids,]
tt = row.match(boot[uni,],data)
clus.mat[i] = clusteval::cluster_similarity(orig.tree$ctree.out@get_where()[tt],
out[[i]]$ctree.out@get_where()[uni])
}
counts.mean["ctree",] <- colMeans(var.count)
counts.var["ctree",] <- round(matrixStats::colVars(var.count),2)
nn <- plyr::ldply(where.ctree)
if(length(unique(nn[,1])) == 1){
res$where.ctree <- table(nn)
}else{
res$where.ctree <- sapply(split(nn, nn$var),table)
}
if(length(unique(where.ctree)) == 1){
stability[,"ctree"] <- 1
}else{
stability[,"ctree"] <- 1-length(unique(where.ctree))/n.rep
}
similarity[,"ctree"] = mean(clus.mat)
}
tt.array <- simplify2array(firSplit)
#print(tt.array[methods,,])
firstSplit <- list()
for(i in 1:length(methods)){
# firstSplit[[methods[i]]] <- table(matrix(tt.array[methods[i],,],length(methods),2)[1,])
firstSplit[[methods[i]]] <- table(tt.array[methods[i],,][1,])
}
if(any(methods==c("rpart"))){
var.count <- matrix(NA,length(out),length(preds))
colnames(var.count) <- preds
where.rpart <- list()
clus.mat = rep(NA,n.rep)
for(i in 1:(n.rep)){
set.seed(i)
ids <- sample(nrow(data),nrow(data),replace=TRUE)
uni = unique(ids)
boot = data[ids,]
tt = row.match(boot[uni,],data)
clus.mat[i] = clusteval::cluster_similarity(orig.tree$rpart.out$where[tt],
out[[i]]$rpart.out$where[uni])
}
for(i in 1:length(out)){
hh <- out[[i]]$rpart.splits
# if(is.na(hh)){
#
# }
hh[,2] <- round(hh[,2],roundVal)
tab <- table(hh[,1])
where.rpart[[i]] <- hh
for(j in 1:length(preds)){
var.count[i,preds[j]] <- tab[preds[j]]
if(is.na(var.count[i,preds[j]]==TRUE)) var.count[i,preds[j]] <- 0
}
}
counts.mean["rpart",] <- colMeans(var.count)
counts.var["rpart",] <- round(matrixStats::colVars(var.count),2)
nn <- plyr::ldply(where.rpart)
if(length(unique(nn[,1])) == 1){
res$where.rpart <- table(nn)
}else{
res$where.rpart <- sapply(split(nn, nn$var),table)
}
if(length(unique(where.rpart)) == 1){
stability[,"rpart"] <- 1
}else{
stability[,"rpart"] <- 1-length(unique(where.rpart))/length(out)
}
# old stability[,"rpart"] <- 1-length(unique(where.rpart))/length(out)
similarity[,"rpart"] <- mean(clus.mat)
}
if(any(methods==c("evtree"))){
var.count <- matrix(NA,length(out),length(preds))
colnames(var.count) <- preds
where.evtree <- list()
clus.mat = rep(NA,n.rep)
for(i in 1:(n.rep)){
set.seed(i)
ids <- sample(nrow(data),nrow(data),replace=TRUE)
uni = unique(ids)
boot = data[ids,]
tt = row.match(boot[uni,],data)
clus.mat[i] = clusteval::cluster_similarity(orig.tree$evtree.out$fitted$`(fitted)`[tt],
out[[i]]$evtree.out$fitted$`(fitted)`[uni])
}
for(i in 1:length(out)){
hh <- out[[i]]$evtree.splits
hh[,2] <- round(hh[,2],roundVal)
tab <- table(hh[,1])
where.evtree[[i]] <- hh
for(j in 1:length(preds)){
var.count[i,preds[j]] <- tab[preds[j]]
if(is.na(var.count[i,preds[j]]==TRUE)) var.count[i,preds[j]] <- 0
}
}
counts.mean["evtree",] <- colMeans(var.count)
counts.var["evtree",] <- round(matrixStats::colVars(var.count),2)
nn <- plyr::ldply(where.evtree)
nn[,1] <- as.character(nn[,1])
if(length(unique(nn[,1])) == 1){
res$where.evtree <- table(nn)
}else{
res$where.evtree <- sapply(split(nn, nn$var),table)
}
if(length(unique(where.evtree)) == 1){
stability[,"evtree"] <- 1
}else{
stability[,"evtree"] <- 1-length(unique(where.evtree))/length(out)
}
# stability[,"evtree"] <- 1-length(unique(where.evtree))/length(out)
similarity[,"evtree"] <- mean(clus.mat)
}
if(any(methods==c("ctreePrune"))){
var.count <- matrix(NA,length(out),length(preds))
colnames(var.count) <- preds
where.ctreePrune <- list()
clus.mat = rep(NA,n.rep)
for(i in 1:(n.rep)){
set.seed(i)
ids <- sample(nrow(data),nrow(data),replace=TRUE)
uni = unique(ids)
boot = data[ids,]
#mm = ids %in% uni
#print(mm)
mm = match(uni,ids)
# print(row.names(out[[i]]$ctreePrune.out$fitted))
# print(row.names(orig.tree$ctreePrune.out$fitted))
#print(str(out[[i]]$ctreePrune.out$fitted))
# ids2 = row.names(out[[i]]$ctreePrune.out$fitted)
# mm = row.names(orig.tree$ctreePrune.out$fitted) %in% ids2
# print(mm)
tt = row.match(boot[uni,],data)
#tt = prodlim::row.match(ids2,row.names(orig.tree$ctreePrune.out$fitted))
# print(orig.tree$ctreePrune.out$fitted$`(fitted)`[1:10])
# print(out[[i]]$ctreePrune.out$fitted$`(fitted)`[1:10])
clus.mat[i] = clusteval::cluster_similarity(orig.tree$ctreePrune.out$fitted$`(fitted)`[tt], #mm
out[[i]]$ctreePrune.out$fitted$`(fitted)`[uni])
# print(head(out[[i]]$ctreePrune.out$fitted[uni,],15))
# print(head(orig.tree$ctreePrune.out$fitted[mm,],15))
}
for(i in 1:length(out)){
hh <- out[[i]]$ctreePrune.splits
hh[,2] <- round(hh[,2],roundVal)
tab <- table(hh[,1])
where.ctreePrune[[i]] <- hh
for(j in 1:length(preds)){
var.count[i,preds[j]] <- tab[preds[j]]
if(is.na(var.count[i,preds[j]]==TRUE)) var.count[i,preds[j]] <- 0
}
}
counts.mean["ctreePrune",] <- colMeans(var.count)
counts.var["ctreePrune",] <- round(matrixStats::colVars(var.count),2)
nn <- plyr::ldply(where.ctreePrune)
nn[,1] <- as.character(nn[,1])
if(length(unique(nn[,1])) == 1){
res$where.ctreePrune <- table(nn)
}else{
res$where.ctreePrune <- sapply(split(nn, nn$var),table)
}
if(length(unique(where.ctreePrune)) == 1){
stability[,"ctreePrune"] <- 1
}else{
stability[,"ctreePrune"] <- 1-length(unique(where.ctreePrune))/length(out)
}
# stability[,"ctreePrune"] <- 1-length(unique(where.ctreePrune))/length(out)
similarity[,"ctreePrune"] = mean(clus.mat)
}
if(any(methods==c("bump"))){
var.count <- matrix(NA,length(out),length(preds))
colnames(var.count) <- preds
where.bump <- list()
for(i in 1:length(out)){
hh <- out[[i]]$bump.splits
hh[,2] <- round(hh[,2],roundVal)
tab <- table(hh[,1])
where.bump[[i]] <- hh
for(j in 1:length(preds)){
var.count[i,preds[j]] <- tab[preds[j]]
if(is.na(var.count[i,preds[j]]==TRUE)) var.count[i,preds[j]] <- 0
}
}
counts.mean["bump",] <- colMeans(var.count)
counts.var["bump",] <- round(matrixStats::colVars(var.count),2)
nn <- plyr::ldply(where.bump)
if(length(unique(nn[,1])) == 1){
res$where.bump <- table(nn)
}else{
res$where.bump <- sapply(split(nn, nn$var),table)
}
stability[,"bump"] <- 1-length(unique(where.bump))/length(out)
}
res$n.ret <- length(out)
res$firstSplit <- firstSplit
res$stability <- stability
res$similarity <- similarity
res$counts.mean <- counts.mean
res$counts.var <- counts.var
res$means <- round(ret.mean,3)
res$variances <- round(ret.var,3)
res
}else{
stop("Curently not working")
methods2=methods
if(any(methods2==c("lm","rf"))) stop("only decision tree methods can be used with stable learner")
trees <- dtree(formula,data,methods=methods2,samp.method,
tuneLength,subset,perc.sub,prune,weights)
if(any(methods2==c("rpart"))){
formula2 <- terms(formula,data=data)
tree1 <- rpart(formula(formula2),data)
tree2 <- prune(tree1,cp=as.numeric(trees$rpart.train$bestTune))
# res$rpart <- stablelearner::stabletree(tree2,data=data,B=100)
}
if(any(methods2==c("ctree"))){
# formula2 <- terms(formula,data=data)
#val = 1-as.numeric(trees$ctree.train$bestTune)
#ctrl=ctree_control()#(mincriterion=val)
# treet <- partykit::ctree(formula(formula2),data=data)#,control=ctrl)
# res$ctree <- stablelearner::stabletree(treet,data=data,formula=formula(formula2))
tt = train(formula, data=data,method="ctree")
tree1 <- partykit::ctree(formula, data=data,
control=ctree_control(mincriterion=as.numeric(tt$bestTune)))
# stablelearner::stabletree(tree1)
}
if(any(methods2==c("evtree"))){
tree <- dtree(formula,data,methods="evtree",samp.method,
tuneLength,subset,perc.sub,prune,weights)$evtree.out
#res$evtree <- stablelearner::stabletree(tree,data=data,B=100)
}
}
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.