Nothing
#' Multivariate parallel archetypoid algorithm for large applications (ADALARA)
#'
#' @aliases adalara
#'
#' @description
#' The ADALARA algorithm is based on the CLARA clustering algorithm. This is the
#' parallel version of the algorithm to try to get faster results. It allows to
#' detect anomalies (outliers). There are two different methods to detect them:
#' the adjusted boxplot (default and most reliable option) and tolerance intervals.
#' If needed, tolerance intervals allow to define a degree of outlierness.
#'
#' @usage
#' adalara(data, N, m, numArchoid, numRep, huge, prob, type_alg = "ada",
#' compare = FALSE, vect_tol = c(0.95, 0.9, 0.85), alpha = 0.05,
#' outl_degree = c("outl_strong", "outl_semi_strong", "outl_moderate"),
#' method = "adjbox", frame)
#'
#' @param data Data matrix. Each row corresponds to an observation and each column
#' corresponds to a variable. All variables are numeric. The data must have row names
#' so that the algorithm can identify the archetypoids in every sample.
#' @param N Number of samples.
#' @param m Sample size of each sample.
#' @param numArchoid Number of archetypes/archetypoids.
#' @param numRep For each \code{numArchoid}, run the archetype algorithm \code{numRep}
#' times.
#' @param huge Penalization added to solve the convex least squares problems.
#' @param prob Probability with values in [0,1].
#' @param type_alg String. Options are 'ada' for the non-robust adalara algorithm and
#' 'ada_rob' for the robust adalara algorithm.
#' @param compare Boolean argument to compute the robust residual sum of squares
#' if \code{type_alg = "ada"} and the non-robust if \code{type_alg = "ada_rob"}.
#' @param vect_tol Vector with the tolerance values. Default c(0.95, 0.9, 0.85).
#' Needed if \code{method='toler'}.
#' @param alpha Significance level. Default 0.05. Needed if \code{method='toler'}.
#' @param outl_degree Type of outlier to identify the degree of outlierness.
#' Default c("outl_strong", "outl_semi_strong", "outl_moderate").
#' Needed if \code{method='toler'}.
#' @param method Method to compute the outliers. Options allowed are 'adjbox' for
#' using adjusted boxplots for skewed distributions, and 'toler' for using
#' tolerance intervals.
#' @param frame Boolean value to indicate whether the frame is
#' computed (Mair et al., 2017) or not. The frame is made up of a subset of
#' extreme points, so the archetypoids are only computed on the frame.
#' Low frame densities are obtained when only small portions of the data were extreme.
#' However, high frame densities reduce this speed-up.
#'
#' @return
#' A list with the following elements:
#' \itemize{
#' \item cases Optimal vector of archetypoids.
#' \item rss Optimal residual sum of squares.
#' \item outliers: Outliers.
#' }
#'
#' @author
#' Guillermo Vinue, Irene Epifanio
#'
#' @seealso
#' \code{\link{do_ada}}, \code{\link{do_ada_robust}}, \code{\link{adalara_no_paral}}
#'
#' @references
#' Eugster, M.J.A. and Leisch, F., From Spider-Man to Hero - Archetypal Analysis in
#' R, 2009. \emph{Journal of Statistical Software} \bold{30(8)}, 1-23,
#' \url{https://doi.org/10.18637/jss.v030.i08}
#'
#' Hubert, M. and Vandervieren, E., An adjusted boxplot for skewed distributions, 2008.
#' \emph{Computational Statistics and Data Analysis} \bold{52(12)}, 5186-5201,
#' \url{https://doi.org/10.1016/j.csda.2007.11.008}
#'
#' Kaufman, L. and Rousseeuw, P.J., Clustering Large Data Sets, 1986.
#' \emph{Pattern Recognition in Practice}, 425-437.
#'
#' Mair, S., Boubekki, A. and Brefeld, U., Frame-based Data Factorizations, 2017.
#' Proceedings of the 34th International Conference on Machine Learning,
#' Sydney, Australia, 1-9.
#'
#' Moliner, J. and Epifanio, I., Robust multivariate and functional archetypal analysis
#' with application to financial time series analysis, 2019.
#' \emph{Physica A: Statistical Mechanics and its Applications} \bold{519}, 195-208.
#' \url{https://doi.org/10.1016/j.physa.2018.12.036}
#'
#' Vinue, G., Anthropometry: An R Package for Analysis of Anthropometric Data, 2017.
#' \emph{Journal of Statistical Software} \bold{77(6)}, 1-39,
#' \url{https://doi.org/10.18637/jss.v077.i06}
#'
#' @examples
#' \dontrun{
#' library(Anthropometry)
#' library(doParallel)
#'
#' # Prepare parallelization (including the seed for reproducibility):
#' no_cores <- detectCores() - 1
#' cl <- makeCluster(no_cores)
#' registerDoParallel(cl)
#' clusterSetRNGStream(cl, iseed = 1)
#'
#' # Load data:
#' data(mtcars)
#' data <- mtcars
#' n <- nrow(data)
#'
#' # Arguments for the archetype/archetypoid algorithm:
#' # Number of archetypoids:
#' k <- 3
#' numRep <- 2
#' huge <- 200
#'
#' # Size of the random sample of observations:
#' m <- 10
#' # Number of samples:
#' N <- floor(1 + (n - m)/(m - k))
#' N
#'
#' prob <- 0.75
#'
#' # ADALARA algorithm:
#' preproc <- preprocessing(data, stand = TRUE, percAccomm = 1)
#' data1 <- as.data.frame(preproc$data)
#'
#' adalara_aux <- adalara(data1, N, m, k, numRep, huge, prob,
#' "ada_rob", FALSE, method = "adjbox", frame = FALSE)
#'
#' #adalara_aux <- adalara(data1, N, m, k, numRep, huge, prob,
#' # "ada_rob", FALSE, vect_tol = c(0.95, 0.9, 0.85), alpha = 0.05,
#' # outl_degree = c("outl_strong", "outl_semi_strong", "outl_moderate"),
#' # method = "toler", frame = FALSE)
#'
#' # Take the minimum RSS, which is in the second position of every sublist:
#' adalara <- adalara_aux[which.min(unlist(sapply(adalara_aux, function(x) x[2])))][[1]]
#' adalara
#'
#' # End parallelization:
#' stopCluster(cl)
#' }
#'
#' @importFrom foreach foreach %dopar%
#' @importFrom tolerance nptol.int
#' @importFrom utils relist
#'
#' @export
adalara <- function(data, N, m, numArchoid, numRep, huge, prob, type_alg = "ada",
compare = FALSE, vect_tol = c(0.95, 0.9, 0.85), alpha = 0.05,
outl_degree = c("outl_strong", "outl_semi_strong", "outl_moderate"),
method = "adjbox", frame){
i <- NULL
n <- nrow(data)
#rss_aux <- list()
#k_aux <- list()
#out_tol <- list()
rss_aux <- Inf
rand_obs_iter <- c()
res <- foreach(i = 1:N,
.packages = c("Anthropometry", "archetypes", "nnls",
"adamethods", "tolerance", "utils")) %dopar% { # .options.RNG = seed
# Generate subset:
if (is.null(rand_obs_iter)) {
#set.seed(seed)
#set.seed(1)
rand_obs_si <- sample(1:n, size = m)
}else{
#set.seed(seed)
#set.seed(1)
rand_obs_si <- sample(setdiff(1:n, rand_obs_iter), size = m - numArchoid)
rand_obs_si <- c(rand_obs_si, k_aux)
}
# To accumulate the already sampled individuals:
rand_obs_iter <- c(rand_obs_iter, rand_obs_si)
# Use ADA on si:
si <- data[rand_obs_si,]
# Apply the ADA algorithm on si to compute k_si archetypoids:
if (type_alg == "ada") {
ada_si <- do_ada(si, numArchoid, numRep, huge, compare, vect_tol, alpha,
outl_degree, method, prob)
}else if (type_alg == "ada_rob") {
if (frame) {
si_frame <- frame_in_r(si)
si <- si[si_frame,]
rand_obs_si <- rand_obs_si[si_frame]
}
ada_si <- do_ada_robust(si, numArchoid, numRep, huge, prob, compare, vect_tol, alpha,
outl_degree, method)
}else{
stop("Algorithms available are 'ada' or 'ada_rob'.")
}
k_si <- ada_si$cases
alphas_si <- ada_si$alphas
colnames(alphas_si) <- rownames(si)
# For every observation in data, compute alpha_{k_si} and RSS_{k s_i}:
rss_si <- do_alphas_rss(data, si, huge, k_si, rand_obs_si, alphas_si, type_alg, prob = prob)
if (rss_si[[1]] < rss_aux) {
rss_aux <- rss_si[[1]]
# Remember to get the right position of the archetypoids regarding the whole data:
k_aux <- which(rownames(data) %in% rownames(si)[k_si])
# Compute outliers from the set of archetypoids:
if (method == "adjbox") {
resid_vect <- apply(rss_si[[2]], 2, int_prod_mat_sq)
outl_boxb <- boxB(x = resid_vect, k = 1.5, method = method)
outl <- which(resid_vect > outl_boxb$fences[2])
}else if (method == "toler") {
resid_vect <- apply(rss_si[[2]], 2, int_prod_mat)
# Degree of outlierness:
outl <- do_outl_degree(vect_tol, resid_vect, alpha,
paste(outl_degree, "_non_rob", sep = ""))
}
list(cases = k_aux, rss = rss_aux, outliers = outl)#, improve = "YES")
}else{
list(cases = NA, rss = rss_si[[1]], outliers = NA)#, improve = "NO")
}
#return(list(cases = k_aux, rss = rss_aux, outliers = out_tol))
}
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.