R/sid_main_algorithm.R

Defines functions sid_prune_singletons sid_prune_tree sid_prune_treelist sid_filter_trees sid_p sid_p_tree_es sid_p_ci sid_p_tree sid_group sid_part sid_prune_group sid_untreeify sid_treeify sid_halve sid_sort sid_find_singleton sid_tree_goodness sid_randomize_dataset sid_gen_surrogate sid_singletonize sid_get_goodness defactor

Documented in defactor sid_filter_trees sid_find_singleton sid_gen_surrogate sid_get_goodness sid_group sid_halve sid_p sid_part sid_p_ci sid_prune_group sid_prune_singletons sid_prune_tree sid_prune_treelist sid_p_tree sid_p_tree_es sid_randomize_dataset sid_singletonize sid_sort sid_tree_goodness sid_treeify sid_untreeify

#' Defactor a numeric.
#'
#' Defactor a numeric factor to a true numeric.
#'
#' @param x A numeric factor to convert to a true numeric.
#'
#' @return A numeric.
#'
#' @export
defactor <- function(x) {
    as.numeric(as.character(x))
}


#' Get the goodness (accuracy) of a classifier.
#'
#' Get the goodness (accuracy) of a classifier.
#'
#' @param data_train Training data.
#' @param data_test Testing data.
#' @param classifier A classifier function, default is svm (from the e1071 package).
#'
#' @return The accuracy of the classifier.
#'
#' @export
sid_get_goodness <- function(data_train, data_test, classifier = svm) {
    model  <- classifier(class ~., data = data_train)
    cl_tmp <- predict(model, newdata = data_test)
    sum(data_test$class == cl_tmp) / length(cl_tmp)
}


#' Singletonize the attributes in the vector x.
#'
#' Singletonize the attributes in the vector x.
#'
#' @param x A vector with attribute numbers.
#' @param cl_id The number of the class attribute.
#'
#' @return A list where the attributes in x are singletons.
#'
#' @export
sid_singletonize <- function(x, cl_id) {
    lapply(x, function(i) c(cl_id, i))
}


#' Generate a surrogate (permuted) dataset.
#'
#' Generate a surrogate (permuted) dataset.
#'
#' @param tree The parametrisation of the permutation.
#' @param data_orig The data to permute.
#'
#' @return The permutation of data_orig using tree.
#'
#' @export
sid_gen_surrogate <- function(tree, data_orig) {
    perm  <- permutation(tree, data = data_orig)
    permutedata(data_orig, perm)
}


#' Convenience function for randomising the data inside a dataset.
#'
#' A convenience function to generate a dataset randomized using a particular tree.
#'
#' @param tree The parametrisation of the permutation.
#' @param dataset A dataset containing the data as (dataset$data).
#'
#' @return The permutation of data_orig using tree.
#'
#' @export
sid_randomize_dataset <- function(tree, dataset) {
    dataset$data <- sid_gen_surrogate(tree, dataset$data)
    dataset
}


#' Get the goodness (accuracy) of a tree.
#'
#' Get the goodness (accuracy) of a tree.
#'
#' @param data_train Training data.
#' @param data_test Testing data.
#' @param classifier A classifier function, default is svm (from the e1071 package).
#' @param R Number of replications. Default is 10.
#' @param parallel Calculate in parallel (Boolean, default is \code{TRUE}).
#' @param stats Return statistics for the tree. Default is \code{FALSE}.
#'
#' @return The average goodness.
#'
#' @export
sid_tree_goodness <- function(data_train, data_test, tree, classifier, R = 10, parallel = TRUE, stats = FALSE) {
    ## ----------
    ## make a call to rnorm to consume some randomness
    rnorm(1)
    ## ----------

    if (parallel) {
        out <- simplify2array(mclapply(seq.int(R),
                                       function(i) sid_get_goodness(classifier = classifier, data_train = sid_gen_surrogate(tree, data_train), data_test = data_test),
                                       mc.cores = (detectCores() - 1))
                              )
    }
    else {
        out <- replicate(R,
                         sid_get_goodness(classifier = classifier, data_train = sid_gen_surrogate(tree, data_train), data_test = data_test)
                         )
    }

    if (stats) {
        list("mean" = mean(out), "range" = c(min(out), max(out)), "sd" = sd(out), "median" = median(out), "raw" = out)
    } else {
        mean(out)
    }
}


#' Find attribute to remove from a tree.
#'
#' Testing all possibilities, find the attribute to remove from the
#' grouping that impacts the goodness the least.
#'
#' @param data_train Training data.
#' @param data_test Testing data.
#' @param g_ut List of untested singletons.
#' @param g_s List with already removed attributes as singletons.
#' @param id_cl The number of the class attribute.
#' @param classifier A classifier function, default is svm (from the e1071 package).
#' @param R Number of replications. Default is 10.
#' @param parallel Calculate in parallel (Boolean, default is \code{TRUE}).
#'
#' @return The attribute to remove.
#'
#' @export
sid_find_singleton <- function(data_train, data_test, g_ut, g_s, id_cl, R = 10, classifier, parallel = TRUE) {
    x <- sapply(g_ut, function(i) {
        tree   <- c(list(c(id_cl, setdiff(g_ut, i))), sid_singletonize(c(g_s, i), id_cl))
        sid_tree_goodness(data_train, data_test, tree, classifier = classifier, R = R, parallel = parallel)
    })
    g_ut[which.max(x)]
}


#' Rank attributes
#'
#' After ranking, the attributes are ordered so that those belonging
#' to the same group are consecutive.
#'
#' @param data_train Training data.
#' @param data_test Testing data.
#' @param R Number of replications. Default is 10.
#' @param classifier A classifier function, default is svm (from the e1071 package).
#' @param parallel Calculate in parallel (Boolean, default is \code{TRUE}).
#'
#' @return The attributes, sorted.
#'
#' @export
sid_sort <- function(data_train, data_test, R = 10, classifier = svm, parallel = TRUE) {
    id_cl   <- which(names(data_train) == "class")
    n_attr  <- ncol(data_train) - 1

    g_ut <- seq.int(n_attr)
    g_s  <- c()

    for (i in seq.int(n_attr)) {
        s    <- sid_find_singleton(data_train, data_test, g_ut, g_s, id_cl, R, classifier, parallel = parallel)
        g_ut <- setdiff(g_ut, s)
        g_s  <- c(g_s, s)
    }
    g_s
}


#' Calculate goodness for 2-group attribute sets.
#'
#' fter ranking, the attributes are ordered so that those belonging to
#' the same group are consecutive. Now calculate the accuracy for each
#' grouping formed by splitting this sorted list of attributes into a
#' tree with 2 groups, for each possible split (number of attributes -
#' 1).
#'
#' @param data_train Training data.
#' @param data_test Testing data.
#' @param attr_rank Sorted list of attributes (from \code{sid_sort}).
#' @param R Number of replications. Default is 10.
#' @param classifier A classifier function, default is svm (from the e1071 package).
#' @param parallel Calculate in parallel (Boolean, default is \code{TRUE}).
#'
#' @return The attributes, sorted.
#'
#' @export
sid_halve <- function(data_train, data_test, attr_rank, R = 10, classifier = svm, parallel = parallel) {
    id_cl  <- which(names(data_train) == "class")
    n_attr <- length(attr_rank)

    sapply(seq.int(n_attr-1), function(i) {
        g_l   <- attr_rank[1:i]
        g_r   <- attr_rank[(i+1):n_attr]
        tree  <- c(list(c(id_cl, g_l), c(id_cl, g_r)))
        sid_tree_goodness(data_train, data_test, tree, R = R, classifier = classifier, parallel = parallel)
    })
}


#' Create a valid tree from a list of attribute indices.
#'
#' Create a valid tree from a list of attribute indices by adding the
#' index of the class attribute to each group in the tree.
#'
#' @param tree The tree.
#' @param data The data with the attributes.
#'
#' @return A valid tree.
#'
#' @export
sid_treeify <- function(tree, data) {
    id_cl <- which(names(data) == "class")
    sapply(tree, function(i) unique(c(id_cl, i)), simplify = FALSE)

}


#' Undo treeification.
#'
#' Remove the class index from each group in the tree.
#'
#' @param tree The tree.
#' @param data The data with the attributes.
#'
#' @return A valid tree.
#'
#' @export
sid_untreeify <- function(tree, data) {
    id_cl <- which(names(data) == "class")
    sapply(tree, function(i) setdiff(i, id_cl), simplify = FALSE)
}


#' Prune group.
#'
#' Prune group (singleton or regular group).
#'
#' @param tree The tree.
#' @param data The data with the attributes.
#' @param pg The group to prune.
#'
#' @return A valid tree.
#'
#' @export
sid_prune_group <- function(tree, data, pg) {
    id_cl     <- which(names(data) == "class")
    for (i in pg)
        tree[[i]] <- c(0, setdiff(tree[[i]], id_cl))
    tree
}


#' Partition an array into two.
#'
#' Partition an array x into two lists at the index si.
#'
#' @param x An array.
#' @param si Split index.
#'
#' @return A list with the left and right parts of x split at the
#'     index si.
#'
#' @export
sid_part <- function(x, si) {
    k <- length(si) + 1
    y <- rep(k, length(x))

    for (i in rev(si)) {
        k      <- k - 1
        y[1:i] <- k
    }

    out        <- split(x, y)
    names(out) <- NULL
    out
}

#' Find a grouping from a sorted list of attributes.
#'
#' Find the best grouping from a sorted list of attributes and the goodness.
#'
#' @param data_train Training data.
#' @param data_test Testing data.
#' @param attr_rank Sorted list of attributes (from \code{sid_sort}).
#' @param R Number of replications. Default is 10.
#' @param classifier A classifier function, default is svm (from the e1071 package).
#' @param parallel Calculate in parallel (Boolean, default is \code{TRUE}).
#'
#' @return A list with the left and right parts of x split at the
#'     index si.
#'
#' @export
sid_group <- function(data_train, data_test, attr_rank, R = 10, classifier = svm, parallel = TRUE) {
    g_vec    <- vector(mode = "numeric", length = length(attr_rank))
    tree_vec <- vector(mode = "list", length = length(attr_rank))

    split_acc  <- sid_halve(data_train, data_test, attr_rank, R = R, classifier = classifier, parallel = parallel)
    split_rank <- order(split_acc, decreasing = TRUE)

    ## Find the optimal split and the associated trees
    for (k in seq.int(from = 2, to = length(attr_rank))) {
        ## Optimal split indices for the k-grouping given by the top (k-1) splits
        si <- sort(split_rank[1:(k-1)])

        tree_cl      <- sid_treeify(sid_part(attr_rank, si), data_train)
        goodness_tmp <- sid_tree_goodness(data_train, data_test, tree_cl, R = R, classifier = classifier, parallel = parallel)

        tree_vec[[k]] <- list("tree" = tree_cl, "goodness" = goodness_tmp, "k" = k)
        g_vec[k]      <- goodness_tmp
    }

    list("goodness" = g_vec[-1], "trees" = tree_vec[-1])
}


#' Calculate the p-value for a tree.
#'
#' Calculate the p-value for a tree (grouping).
#'
#' @param data_train Training data.
#' @param data_test Testing data.
#' @param tree The tree
#' @param g0 The Baseline goodness, if known. Can be \code{NULL} (default).
#' @param classifier A classifier function, default is svm (from the e1071 package).
#' @param R Number of replications. Default is 100.
#' @param histogram Plot histogram (Boolean, default is \code{FALSE}.
#' @param return_raw Return raw data used in the p-value calculation (Boolean, default is \code{FALSE}.
#' @param parallel Calculate in parallel (Boolean, default is \code{TRUE}).
#'
#' @return The p-value for the tree
#'
#' @export
sid_p_tree <- function(data_train, data_test, tree, g0 = NULL, classifier = svm, R = 100, histogram = FALSE, parallel = TRUE, return_raw = FALSE) {
    ## ----------
    ## make a call to rnorm to consume some randomness
    rnorm(1)
    ## ----------

    ## Calculate the original goodness if it is not given
    if (is.null(g0))
        g0 <- sid_get_goodness(data_train, data_test, classifier = classifier)

    if (parallel) {
        d <- simplify2array(mclapply(seq.int(R), function(i) sid_tree_goodness(data_train, data_test, tree, classifier = classifier, R = 1, parallel = FALSE), mc.cores = (detectCores() - 1)))
    }
    else {
        d <- replicate(R, sid_tree_goodness(data_train, data_test, tree, classifier = classifier, R = 1, parallel = FALSE))
    }

    nu <- sum(d >= g0)
    p <- (1 + nu) / (1 + R)

    if (histogram) {
        x11()
        hist(d, xlim = c(0.5, 1), main = paste0(paste0("p = ", round(p, 5)), "\n", gsub("c", "", paste0(tree, collapse = " "))))
        abline(v = g0, col = "blue", lwd = 5, lty = 3)
    }
    if (return_raw) {
        list("d" = d, "p" = p, "g0" = g0)
    }  else {
        p
    }
}


#' Binomial confidence interval for the p-value
#'
#' binomial confidence interval for the p-value using normal approximation
#'
#' @param p Proportion of successes
#' @param n Number of samples
#' @param z Parameter for confidence band. Use 1.96 for 95 percent, 2.25 for 97.5 percent and 2.57 for 99 percent. Default is 2.57.
#'
#' @return Confidence interval for the p-value
#'
#' @export
sid_p_ci <- function(p, n, z = 2.57) {
    a <- z * sqrt((1/n)*p*(1-p))
    p + c(-a, a)
}


#' Calculate the p-value for a tree (using early stopping)
#'
#' Calculate the p-value for a tree (grouping).
#'
#' @param data_train Training data.
#' @param data_test Testing data.
#' @param tree The tree
#' @param g0 The Baseline goodness, if known. Can be \code{NULL} (default).
#' @param classifier A classifier function, default is svm (from the e1071 package).
#' @param Rmin Ninimum number of replications. Default is 250.
#' @param Rmax Maximum number of replications. Default is 500.
#' @param return_raw Return raw data used in the p-value calculation (Boolean, default is \code{FALSE}.
#' @param alpha The confidence level at which we are evaluating statistical significance of the tree.
#' @param z Parameter for confidence band. Use 1.96 for 95 percent, 2.25 for 97.5 percent and 2.57 for 99 percent. Default is 2.57.
#' @param parallel Calculate in parallel (Boolean, default is \code{TRUE}).
#'
#' @return The p-value for the tree
#'
#' @export
sid_p_tree_es <- function(data_train, data_test, tree, g0 = NULL, classifier = svm, Rmin = 250, Rmax = 500, parallel = TRUE, return_raw = FALSE, alpha = 0.05, z = 2.57) {
    ## ----------
    ## make a call to rnorm to consume some randomness
    rnorm(1)
    ## ----------

    ## Calculate the original goodness if it is not given
    if (is.null(g0))
        g0 <- sid_get_goodness(data_train, data_test, classifier = classifier)

    if (parallel) {
        d <- simplify2array(mclapply(seq.int(Rmin), function(i) sid_tree_goodness(data_train, data_test, tree, classifier = classifier, R = 1, parallel = FALSE), mc.cores = (detectCores() - 1)))
    }
    else {
        d <- replicate(Rmin, sid_tree_goodness(data_train, data_test, tree, classifier = classifier, R = 1, parallel = FALSE))
    }

    ## check the current confidence interval
    ns <- sum(d >= g0)
    n <- Rmin
    p <- (1 + ns) / (1 + Rmin)

    ## If the band does not cover alpha we can stop here,
    ## otherwise sample until
    ##    -- (1) Rmax or
    ##    -- (2) until band does not cover alpha

    dvec <- d
    for (i in seq.int(Rmax-Rmin)) {
        ## if (! (alpha >= sid_p_ci(p, n+1)[1] && alpha <= sid_p_ci(p, n+1)[2])) {
        ## (1) Stop if the upper limit for the band is lower than alpha --> alpha cannot be inside the band
        ## (2) Also stop if the lower band is higher than alpha --> alpha will always be in the band
        ci_tmp <- sid_p_ci(p, n+1, z)
        if ( (ci_tmp[2] < alpha) || (ci_tmp[1] >= alpha) ) {
            break
        }

        d <- sid_tree_goodness(data_train, data_test, tree, classifier = classifier, R = 1, parallel = FALSE)
        dvec <- c(dvec, d)
        n <- n + 1
        if (d >= g0) {
            ns <- ns + 1
        }
        p <- (1 + ns) / (1 + n)
        ## debug printing
        ##   cat(n, "\t", sid_p_ci(p, n+1), "\t", p, "\n")
    }

    ##    cat("done!", "\t", n, "\t", sid_p_ci(p, n+1), "\n")

    if (return_raw) {
        list("d" = dvec, "p" = p, "g0" = g0)
    }  else {
        p
    }
}


#' Calculate the p-values for a list of trees.
#'
#' Calculate the p-value for a list of trees (groupings).
#'
#' @param data_train Training data.
#' @param data_test Testing data.
#' @param tree_list List of trees.
#' @param g0 The Baseline goodness, if known. Can be \code{NULL} (default).
#' @param classifier A classifier function, default is svm (from the e1071 package).
#' @param Rmin Ninimum number of replications. Default is 250.
#' @param Rmax Maximum number of replications. Default is 500.
#' @param alpha The confidence level at which we are evaluating statistical significance of the tree.
#' @param z Parameter for confidence band. Use 1.96 for 95 percent, 2.25 for 97.5 percent and 2.57 for 99 percent. Default is 2.57.
#' @param parallel Calculate in parallel (Boolean, default is \code{TRUE}).
#' @param early_stopping Use early stopping. Default is \code{FALSE} in which case Rmin samples are used to calculate p-values. If \code{TRUE} at least Rmin and at most Rmax values are used.
#' '
#' @return The treelist with the p-values added.
#'
#' @export
sid_p <- function(data_train, data_test, tree_list, g0 = NULL, classifier = svm, Rmin = 250, Rmax = 500, alpha = 0.05, z = 2.57, parallel = TRUE, early_stopping = FALSE) {
    if (is.null(g0))
        g0 <- sid_get_goodness(data_train, data_test, classifier = classifier)

    for (i in seq.int(length(tree_list))) {
        if (early_stopping) {
            ## -- with early stopping --
            ## tree_list[[i]][["p"]]
            tmp <- sid_p_tree_es(data_train, data_test, tree = tree_list[[i]][["tree"]], g0 = g0, classifier = classifier, Rmin = Rmin, Rmax = Rmax, z = z, alpha = alpha, parallel = parallel, return_raw = TRUE)
        } else {
            ## -- without early stopping --
            ## tree_list[[i]][["p"]]
            tmp <- sid_p_tree(data_train, data_test, tree = tree_list[[i]][["tree"]], g0 = g0, classifier = classifier, R = Rmin, parallel = parallel, return_raw = TRUE)
        }

        ## store the p-value
        tree_list[[i]][["p"]] <- tmp$p

        acc_stats        <- list()
        acc_stats$mean   <- mean(tmp$d)
        acc_stats$median <- median(tmp$d)
        acc_stats$range  <- c(min(tmp$d), max(tmp$d))
        acc_stats$g0     <- g0
        acc_stats$sd     <- sd(tmp$d)
        acc_stats$raw    <- tmp$d
        acc_stats$p      <- tmp$p
        acc_stats$ci     <- compute_ci(acc_stats$raw, alpha = 100 * alpha)
        
        ## Add statistics on the tree
        tree_list[[i]][["acc_stats"]] <- acc_stats
    }

    tree_list

}


#' Find trees that are acceptable at the alpha significance level.
#'
#' Find trees that are acceptable at the alpha significance level.
#'
#' @param tree_list List of trees.
#' @param alpha Significance level (default is 0.05).
#' @param max_k Boolean. If \code{TRUE} (default) return only the tree of maximum size.
#'
#' @return List of accetable trees of the maximum size tree.
#'
#' @export
sid_filter_trees <- function(tree_list, alpha = 0.05, max_k = TRUE) {
    id        <- unlist(sapply(seq.int(length(tree_list)), function(i) if (tree_list[[i]][["p"]] < alpha) i))

    if (! is.null(id))
        tree_list <- tree_list[-id]

    if (max_k)
        if (length(tree_list) > 1)
            tree_list <- tree_list[which.max(sapply(tree_list, function(i) i[["k"]]))]

    tree_list
}


#' Prune all trees in the treelist.
#'
#' Prune all trees in the treelist.
#'
#' @param treelist List of trees.
#' @param data_train Training data.
#' @param data_test Testing data.
#' @param classifier A classifier function, default is svm (from the e1071 package).
#' @param R Number of replications. Default is 100..
#' @param alpha Significance level (default is 0.05).
#' @param Rmin Ninimum number of replications for calculating p-values. Default is 250.
#' @param Rmax Maximum number of replications for calculating p-values. Default is 500.
#' @param z Parameter for confidence band. Use 1.96 for 95 percent, 2.25 for 97.5 percent and 2.57 for 99 percent. Default is 2.57.
#' @param parallel Calculate in parallel (Boolean, default is \code{TRUE}).
#'
#' @return Pruned treelist
#'
#' @export
sid_prune_treelist <- function(treelist, data_train, data_test, classifier = svm, R = R, Rmin = 250, Rmax = 500, z = 2.57, alpha = 0.05, parallel = TRUE) {
    tmp <- lapply(treelist,
                  function(i) sid_prune_singletons(i, data_train, data_test, R = Rmin, classifier = classifier, alpha = alpha, parallel = parallel)
                  )

    valid_vals <- which(sapply(tmp, function(i) i$p) >= alpha)
    treelist[valid_vals] <- tmp[valid_vals]

    treelist
}


#' Prune tree.
#'
#' Prune tree.
#'
#' @param tree_res A tree results structure.
#' @param data_train Training data.
#' @param data_test Testing data.
#' @param classifier A classifier function, default is svm (from the e1071 package).
#' @param R Number of replications. Default is 100.
#' @param alpha Significance level (default is 0.05).
#' @param parallel Calculate in parallel (Boolean, default is \code{TRUE}).
#'
#' @return The pruned tree results structure.
#'
#' @export
sid_prune_tree <- function(tree_res, data_train, data_test, classifier = svm, R = 100, alpha = 0.05, parallel = TRUE) {
    id_cl <- which(names(data) == "class")
    tree0 <- sid_treeify(tree_res[["tree"]], data_train)
    ng    <- length(tree0)

    gp   <- c()

    for (i in seq.int(ng)) {
        tree <- sid_prune_group(tree0, data_train, c(gp, i))
        p    <- sid_p_tree(data_train, data_test, tree, g0 = NULL, classifier = classifier, R = R, parallel = parallel)

        if (p >= alpha)
            gp <- c(gp, i)
    }

    if (length(gp) >= 1) {
        tree_res[["tree"]]     <- sid_prune_group(tree0, data_train, gp)
        tree_res[["goodness"]] <- sid_tree_goodness(data_train, data_test, tree_res[["tree"]], classifier = classifier, R = R)
        tree_res[["p"]]        <- sid_p_tree(data_train, data_test, tree_res[["tree"]], g0 = NULL, classifier = classifier, R = R)

    }

    tree_res
}




#' Prune singletons from tree.
#'
#' Prune singletons from tree.
#'
#' @param tree_res A tree results structure.
#' @param data_train Training data.
#' @param data_test Testing data.
#' @param classifier A classifier function, default is svm (from the e1071 package).
#' @param R Number of replications. Default is 100.
#' @param Rmin Ninimum number of replications for calculating p-values. Default is 250.
#' @param Rmax Maximum number of replications for calculating p-values. Default is 500.
#' @param z Parameter for confidence band. Use 1.96 for 95 percent, 2.25 for 97.5 percent and 2.57 for 99 percent. Default is 2.57.
#' @param alpha Significance level (default is 0.05).
#' @param parallel Calculate in parallel (Boolean, default is \code{TRUE}).
#'
#' @return The pruned tree results structure.
#'
#' @export
sid_prune_singletons <- function(tree_res, data_train, data_test, classifier = svm, R = 100, Rmin = 250, Rmax = 500, z = 2.57, alpha = 0.05, parallel = TRUE) {
    id_cl <- which(names(data) == "class")
    tree0 <- sid_treeify(tree_res[["tree"]], data_train)

    ## store indices of singletons in g_ut
    g_ut <- which(sapply(tree0, function(i) length(i)) == 2)

    ## empty list to store pruned singletons
    gp   <- c()

    ## Greedily find the singleton to remove that impacts accuracy the least
    ## Continue as long as p >= alpha

    ## -------------------------------

    for (i in seq.int(length(g_ut))) {
        ## calculate accuracy for all the trees
        x <- sapply(g_ut, function(i) {
            tree <- sid_prune_group(tree0, data_train, c(gp, i))
            sid_tree_goodness(data_train, data_test, tree, classifier = classifier, R = R, parallel = parallel)
        })
        singleton_candidate <- g_ut[which.max(x)]

        ## calculate p-value of the tree
        tree <- sid_prune_group(tree0, data_train, c(gp, singleton_candidate))
        p    <- sid_p_tree_es(data_train, data_test, tree, g0 = NULL, classifier = classifier, Rmin = Rmin, Rmax = Rmax, z = z, alpha = alpha, parallel = parallel)

        if (p >= alpha) {
            gp <- c(gp, singleton_candidate)
            g_ut <- setdiff(g_ut, singleton_candidate)
        } else {
            break
        }

        if (length(gp) >= 1) {
            tree_res[["tree"]]     <- sid_prune_group(tree0, data_train, gp)
            tree_res[["goodness"]] <- sid_tree_goodness(data_train, data_test, tree_res[["tree"]], classifier = classifier, R = R, parallel = parallel)
            tree_res[["p"]]        <- sid_p_tree_es(data_train, data_test, tree_res[["tree"]], g0 = NULL, classifier = classifier, Rmin = Rmin, Rmax = Rmax, z = z, alpha = alpha, parallel = parallel)
        }
    }
    tree_res


}
bwrc/astrid-r documentation built on June 24, 2017, 8:05 p.m.