astrid-r: Automatic STructure IDentification

#' Read UCI data set stored in arff format and transform all columns
#' to factors.
#'
#' @param dataset The name of the dataset without file suffix.
#' @param datapath The path to the directory containing the datasets,
#'                 with trailing slash.
#' @param factorize Should the columns be transformed to
#'                  factors. Default is FALSE
#' @param na.omit Omit NAs. Default is TRUE.
#' @return The classification accuracy.
#'
#' @export
read_uci_dataset <- function(datapath, dataset, factorize = FALSE, na.omit = TRUE, remove_constant = TRUE) {

    data <- read.arff(file.path(datapath, paste0(dataset, ".arff")))

    if (na.omit)
        data <- na.omit(data)

    if (remove_constant) {
        ind_const <- as.numeric(which(sapply(data[-ncol(data)], function(i) length(unique(i))) == 1))
        if (length(ind_const) > 0)
            data <- data[,-ind_const]

    }

    colnames(data)[ncol(data)] <- "class"

    if(factorize) {
        for(i in colnames(data)) {
            data[,i] <- factor(data[,i])
        }
    }

    names(data) <- gsub("-", "_", names(data))
    names(data) <- gsub(" ", "_", names(data))
    names(data) <- gsub("[?]", "", names(data))
    names(data) <- gsub("/", "", names(data))

    list("name" = dataset, "data" = data)
}


#' Get properties of a dataset.
#'                                       #
#' Get properties of a dataset.
#'
#' @param dataset The dataset (a list).
#'
#' @return The dataset structure with a new list element \code{properties} added containing properties of the dataset.
#'
#' @export
get_dataset_properties <- function(dataset) {

    dataset[["properties"]] <- list("name"          = dataset$name,
                                    "names"         = names(dataset$data)[-ncol(dataset$data)],
                                    "n_classes"     = length(unique(dataset$data$class)),
                                    "n_attributes"  = ncol(dataset$data) - 1,
                                    "n"             = nrow(dataset$data),
                                    "major_class_p" = max(table(dataset$data$class)) / nrow(dataset$data))
    dataset

}


#' Split dataset into training and testing (50/50).
#'                                       #
#' Split dataset into training and testing (50/50).
#'
#' @param dataset The dataset
#' @param classname the name of the class in the dataset. Default is \code{class}.
#'
#' @return The dataset structure with two new elements (\code{data_train} and \code{data_test}).
#'
#' @export
split_dataset <- function(dataset, classname = "class") {
    ind                     <- testsplit(dataset$data, stratify = classname)
    dataset[["data_train"]] <- dataset$data[ind, ]
    dataset[["data_test"]]  <- dataset$data[-ind, ]

    dataset
}


#' Split dataset into 3 parts: training, testing, validation (50/25/25).
#'                                       #
#' Split dataset into 3 parts: training, testing, validation (50/25/25).
#'
#' @param dataset The dataset.
#' @param classname the name of the class in the dataset. Default is \code{class}.
#'
#' @return The dataset structure with two new elements (\code{data_train}, \code{data_test} and \code{data_validation}).
#'
#' @export
split_dataset_ttv <- function(dataset, classname = "class") {

    tmp <- dataset$data
    ind_train <- testsplit(tmp, stratify = classname, fraction = 1/2)
    dataset[["data_train"]] <- tmp[ind_train,]

    tmp <- tmp[-ind_train,]
    ind_test <- testsplit(tmp, stratify = classname, fraction = 1/2)
    dataset[["data_test"]] <- tmp[ind_test,]

    dataset[["data_validation"]] <- tmp[-ind_test,]

    dataset

}


#' List datasets (arff-files) in a given path.
#'                                       #
#' List datasets (arff-files) in a given path.
#'
#' @param datapath A path.
#'
#' @return List of arff-files.
#'
#' @export
list_datasets <- function(datapath) {
    flist <- list.files(datapath, pattern = "*.arff")
    gsub(".arff", "", flist)

}


#' List datasets (arff-files) in a given path sorted by their number of attributes.
#'                                       #
#' List datasets (arff-files) in a given path sorted by their number of attributes.
#'
#' @param datapath A path.
#'
#' @return List of arff-files (sorted).
#'
#' @export
list_datasets_sorted <- function(datapath) {
    dataset_list <- list_datasets(datapath)
    n_attr_list  <- sapply(dataset_list, function(i) get_dataset_properties(read_uci_dataset(datapath, i))$properties$n_attributes)
    list("names" =  dataset_list[order(n_attr_list)],
         "n_attributes" = n_attr_list[order(n_attr_list)])
}


#' Analyze the given dataset.
#'
#' Find the attribute interactions in the given dataset.
#'
#' @param dataset The dataset to analyze
#' @param classname The name of the class attribute in the dataset. Default is \code{class}.
#' @param seed Random seed, default is 42.
#' @param seed_traintest Random seed used for splitting the data into training and testing sets, default is 42.
#' @param classifier The classifier to be used, as a string. Default is \code{NULL}.
#' @param alpha Significance level (default is 0.05).
#' @param R Number of samples to use for calculating the accuracy. Default is R = 1 + (1 / alpha).
#' @param Rmin Ninimum number of replications to use for calculating p-values. Default is 250.
#' @param Rmax Maximum number of replications to use for calculating p-values. Default is 500.
#' @param z Parameter for confidence band for p-values. Use 1.96 for 95 percent, 2.25 for 97.5 percent and 2.57 for 99 percent. Default is 2.57.
#' @param prune_singletons Should singletons be pruned. Default is \code{TRUE}.
#' @param parallel Calculate p-values in parallel (Boolean, default is \code{TRUE}). If parallel is used the results are not deterministic using the same random seed.
#' @param early_stopping Use early stopping. Default is \code{FALSE} in which case Rmin samples are used to calculate p-values. If \code{TRUE} at least Rmin and at most Rmax values are used.
#'
#' @return A results strucure (list with fields).
#'
#' @export
analyze_dataset <- function(dataset, classname = "class", seed_traintest = 42, seed = 42, classifier = NULL, alpha = 0.05, R = (1 + ceiling(1 / alpha)), Rmin = 250, Rmax = 500, z = 2.57, prune_singletons = TRUE, parallel = TRUE, early_stopping = FALSE) {
    if (is.null(classifier))
        stop("No classifier specified!")

  
    classifier_str <- classifier
    classifier     <- get(classifier_str)

    ## Set the random seed
    if (parallel) {
        RNGkind("L'Ecuyer-CMRG")
    }
    
   
    ## Set dataset properties
    dataset <- get_dataset_properties(dataset)

    ## Set the random seed for splitting into training and testing
    cat("Using a train-test random seed of: ", seed_traintest, "\n")
    set.seed(seed_traintest)

    ## Split the dataset into training, testing and validation
    ## dataset <- split_dataset(dataset, classname = classname)
    dataset <- split_dataset_ttv(dataset, classname = classname)

    ## Set random seed for the rest of the algorithm
    cat("Using a random seed of: ", seed, "\n")
    set.seed(seed)

    
    ## Start timer
    timelist <- list()
    t_start <- Sys.time()

    ## --------------------------------------------------
    ## Perform the analysis
    ## --------------------------------------------------
    ## (1) Sorting step
    cat("Running sorting step.")
    t_tmp                <- Sys.time()
    attributes_sorted    <- sid_sort(dataset$data_train, dataset$data_test, R = R, classifier = classifier, parallel = parallel)
    timelist[["t_sort"]] <- as.numeric(difftime(Sys.time(), t_tmp, units = "secs"))
    cat("\t[DONE]\n")

    ## (2) Grouping step
    cat("Running grouping step.")
    t_tmp                  <- Sys.time()
    res_tree               <- sid_group(dataset$data_train, dataset$data_test, attributes_sorted, R = R, classifier = classifier, parallel = parallel)
    timelist[["t_group"]]  <- as.numeric(difftime(Sys.time(), t_tmp, units = "secs"))
    cat("\t[DONE]\n")
    
    ## (3) Testing step
    cat("Running testing step.")
    t_tmp             <- Sys.time()
    ## res_tree_p        <- sid_p(dataset$data_train, dataset$data_test, res_tree[["trees"]], R = R, classifier = classifier)

    res_tree_p        <- sid_p(dataset$data_train, dataset$data_validation, res_tree[["trees"]], classifier = classifier, Rmin = Rmin, Rmax = Rmax, z = z, alpha = alpha, parallel = parallel, early_stopping = early_stopping)
    timelist[["t_p"]] <- as.numeric(difftime(Sys.time(), t_tmp, units = "secs"))
    cat("\t[DONE]\n\n")

    ## Only keep trees that are "valid" (i.e., no sig diff)
    res_tree_p_filtered <- sid_filter_trees(res_tree_p, alpha = alpha, max_k = FALSE)

    if (prune_singletons) {
        ## Prune the final tree, if possible
        res_tree_p_pruned          <- NULL
        res_tree_p_pruned_filtered <- NULL
        timelist[["t_pruned"]]     <- NULL

        if (length(res_tree_p_filtered) > 0) {
            t_tmp                      <- Sys.time()

            res_tree_p_pruned          <- sid_prune_treelist(res_tree_p_filtered, dataset$data_train, dataset$data_validation, R = R,  Rmin = Rmin, Rmax = Rmax, alpha = alpha, z = z, classifier = classifier, parallel = parallel)
            res_tree_p_pruned_filtered <- sid_filter_trees(res_tree_p_pruned, alpha = alpha, max_k = FALSE)
            timelist[["t_pruned"]]     <- as.numeric(difftime(Sys.time(), t_tmp, units = "secs"))
        }

        dataset[["results"]][["tree_p_pruned"]]          <- res_tree_p_pruned
        dataset[["results"]][["tree_p_pruned_filtered"]] <- res_tree_p_pruned_filtered

    }

    ## --------------------------------------------------

    ## Stop time and duration
    timelist[["t_total"]] <- as.numeric(difftime(Sys.time(), t_start, units = "secs"))

    dataset[["results"]][["attributes_sorted"]]      <- attributes_sorted
    dataset[["results"]][["tree"]]                   <- res_tree
    dataset[["results"]][["tree_p"]]                 <- res_tree_p
    dataset[["results"]][["tree_p_filtered"]]        <- res_tree_p_filtered
    dataset[["results"]][["classifier"]]             <- classifier_str
    dataset[["results"]][["original_accuracy"]]      <- sid_get_goodness(dataset$data_train, dataset$data_test, classifier = classifier)
    dataset[["results"]][["number_of_samples"]]      <- R
    dataset[["results"]][["R"]]                      <- R ## same as the number of samples
    dataset[["results"]][["Rmin"]]                   <- Rmin
    dataset[["results"]][["Rmax"]]                   <- Rmax
    dataset[["results"]][["alpha"]]                  <- alpha
    dataset[["results"]][["z"]]                      <- z

    dataset[["results"]][["time"]]                   <- timelist

    dataset
}


#' Save results as rds file.
#'                                       #
#' Save results as rds file, automatically generating the filename based on the dataset name and the used classifier.
#'
#' @param res The results structure.
#' @param savepath Path where to save the resutls.
#'
#' @return Nothing.
#'
#' @export
save_result <- function(res, savepath) {
    ds <- res$properties$name
    cl <- res$results$classifier
    savename <- file.path(savepath, paste0(ds, "_", cl, ".rds"))
    saveRDS(res, file = savename, compress = "xz")
}


#' Load results.
#'                                       #
#' Load results in a given folder (where they were saved using
#' \code{save_result}, given the name of the dataset and the name of
#' the used classifier).
#'
#' @param datapath Directory with the data files.
#' @param dataset The name of the dataset.
#' @param classifier The name of the classifier.
#'
#' @return Nothing.
#'
#' @export
load_result <- function(datapath, dataset, classifier) {
    fname <- file.path(datapath, paste0(dataset, "_", classifier, ".rds"))
    if (file.exists(fname))
        readRDS(fname)
    else
        NULL
}


#' Perform experiments on UCI datasets (in arff format).
#'                                       #
#' Perform experiments on UCI datasets (in arff format).
#'
#' @param dataset_list List of datasets
#' @param classifier_list List of classifiers
#' @param datapath Directory with the data files.
#' @param savepath Path where to save the resutls.
#' @param R Number of samples to use. Default is\code{NULL} in which
#'     case R = 1 + (1 / alpha) will be used by
#'     \code{analyze_dataset}.
#' @param Rmin Ninimum number of replications to use for calculating p-values.
#' @param Rmax Maximum number of replications to use for calculating p-values.
#' @param alpha Significance level (default is 0.05).
#' @param z Parameter for confidence band for p-values. Use 1.96 for 95 percent, 2.25 for 97.5 percent and 2.57 for 99 percent. Default is 2.57.
#' @param prune_singletons Should singletons be pruned. Default is \code{TRUE}.
#' @param seed_traintest Random seed used for splitting the data into training and testing sets, default is 42.
#' @param seed Random seed, default is 42.
#' @param early_stopping Use early stopping. Default is \code{FALSE} in which case Rmin samples are used to calculate p-values. If \code{TRUE} at least Rmin and at most Rmax values are used.
#' 
#' @return Nothing.
#'
#' @export
do_experiment_uci <- function(dataset_list, classifier_list, datapath, savepath, R = NULL, Rmin = NULL, Rmax = NULL, z = NULL, alpha = NULL, prune_singletons = FALSE, seed = 42, seed_traintest = 42, early_stopping = FALSE                ) {
    for (ds in dataset_list) {
        dataset <- read_uci_dataset(datapath, dataset = ds)

        for (cl in classifier_list) {
            msg <- paste0(ds, " -- ", cl, "\n")
            cat(msg)

            if (is.null(R)) {
                res <- analyze_dataset(dataset, classname = "class", classifier = cl, Rmin = Rmin, Rmax = Rmax, alpha = alpha, z = z, prune_singletons = prune_singletons, seed = seed, seed_traintest = seed_traintest, early_stopping = early_stopping)
            } else {
                res <- analyze_dataset(dataset, classname = "class", classifier = cl, R = R,  Rmin = Rmin, Rmax = Rmax, alpha = alpha, z = z, prune_singletons = prune_singletons, seed = seed, seed_traintest = seed_traintest, early_stopping = early_stopping)
            }

            save_result(res, savepath)
        }
    }
}


#' Perform experiments on synthetic datasets.
#'                                       #
#' Perform experiments on synthetic datasets.
#'
#' @param classifier_list List of classifiers
#' @param datapath Directory with the data files.
#' @param savepath Path where to save the resutls.
#' @param R Number of samples to use. Default is\code{NULL} in which
#'     case R = 1 + (1 / alpha) will be used by
#'     \code{analyze_dataset}.
#' @param Rmin Ninimum number of replications to use for calculating p-values.
#' @param Rmax Maximum number of replications to use for calculating p-values.
#' @param alpha Significance level (default is 0.05).
#' @param z Parameter for confidence band for p-values. Use 1.96 for 95 percent, 2.25 for 97.5 percent and 2.57 for 99 percent. Default is 2.57.
#' @param prune_singletons Should singletons be pruned. Default is \code{TRUE}.
#' @param seed_traintest Random seed used for splitting the data into training and testing sets, default is 42.
#' @param seed Random seed, default is 42.
#' @param early_stopping Use early stopping. Default is \code{FALSE} in which case Rmin samples are used to calculate p-values. If \code{TRUE} at least Rmin and at most Rmax values are used.
#'
#' @return Nothing.
#'
#' @export
do_experiment_synthetic <- function(classifier_list, datapath, savepath, R = NULL, Rmin = NULL, Rmax = NULL, z = NULL, alpha = NULL, prune_singletons = FALSE, seed = 42, seed_traintest = 42, early_stopping = FALSE) {
    dataset <- make_synthetic_dataset(N = 500, seed = 42, mg2 = 0.6)

    for (cl in classifier_list) {
        cat(cl, "\n")
        if (is.null(R)) {
            res <- analyze_dataset(dataset, classname = "class", classifier = cl, Rmin = Rmin, Rmax = Rmax, alpha = alpha, z = z, prune_singletons = prune_singletons, seed = seed, seed_traintest = seed_traintest, early_stopping = early_stopping)
        } else {
            res <- analyze_dataset(dataset, classname = "class", classifier = cl, R = R, Rmin = Rmin, Rmax = Rmax, alpha = alpha, z = z, prune_singletons = prune_singletons, seed = seed, seed_traintest = seed_traintest, early_stopping = early_stopping)
        }

        save_result(res, savepath)
    }
}


#' Compute the confidence intervals (CI) for the accuracy using the quantile method.
#'
#' @param avec A vector with accuracy values.
#'
#' @return The results with the CIs added.
#' @export
compute_ci <- function(avec, alpha = 5) {
    quantile(avec, probs = c(alpha/100, (100-alpha)/100), type = 1)
}