#' Clustering multiple regression curves
#'
#' @description Function for grouping regression curves based on the k-means or
#' k-medians algorithm. It returns the number of groups and the assignment.
#'
#' @param y Response variable.
#' @param x Dependent variable.
#' @param z Categorical variable indicating the population to which
#' the observations belongs.
#' @param kvector A vector specifying the number of groups of curves to be
#' checking.
#' @param kbin Size of the grid over which the survival functions
#' are to be estimated.
#' @param h The kernel bandwidth smoothing parameter.
#' @param nboot Number of bootstrap repeats.
#' @param algorithm A character string specifying which clustering algorithm is used,
#' i.e., k-means(\code{"kmeans"}) or k-medians (\code{"kmedians"}).
#' @param alpha Significance level of the testing procedure. Defaults to 0.05.
#' @param cluster A logical value. If \code{TRUE} (default), the
#' testing procedure is parallelized. Note that there are cases
#' (e.g., a low number of bootstrap repetitions) that R will gain in
#' performance through serial computation. R takes time to distribute tasks
#' across the processors also it will need time for binding them all together
#' later on. Therefore, if the time for distributing and gathering pieces
#' together is greater than the time need for single-thread computing, it does
#' not worth parallelize.
#' @param ncores An integer value specifying the number of cores to be used
#' in the parallelized procedure. If \code{NULL} (default), the number of cores
#' to be used is equal to the number of cores of the machine - 1.
#' @param seed Seed to be used in the procedure.
#' @param multiple A logical value. If \code{TRUE} (not default), the resulted
#' pvalues are adjusted by using one of several methods for multiple comparisons.
#' @param multiple.method Correction method. See Details.
#'
#'@return
#'A list containing the following items:
#' \item{table}{A data frame containing the null hypothesis tested, the values
#' of the test statistic and the obtained pvalues.}
#' \item{levels}{Original levels of the variable \code{z}.}
#' \item{cluster}{A vector of integers (from 1:k) indicating the cluster to
#' which each curve is allocated.}
#' \item{centers}{An object containing the centroids
#' (mean of the curves pertaining to the same group).}
#' \item{curves}{An object containing the fitted curves for each population.}
#'
#'@details The adjustment methods include the Bonferroni correction ("bonferroni")
#' in which the p-values are multiplied by the number of comparisons.
#' Less conservative corrections are also included by Holm (1979) ('holm'),
#' Hochberg (1988) ('hochberg'), Hommel (1988) ('hommel'),
#' Benjamini & Hochberg (1995) ('BH' or its alias 'fdr'), and
#' Benjamini & Yekutieli (2001) ('BY'), respectively.
#' A pass-through option ('none') is also included.
#'
#'
#'@author Nora M. Villanueva and Marta Sestelo.
#'
#'@examples
#' library(clustcurv)
#'
#'# Regression framework
#' res <- regclustcurves(y = barnacle5$DW, x = barnacle5$RC, z = barnacle5$F,
#' algorithm = 'kmeans', nboot = 2, cluster = TRUE, ncores = 2)
#'
#' @importFrom survival Surv
#' @importFrom survival survfit
#' @importFrom parallel detectCores
#' @importFrom parallel makeCluster
#' @importFrom parallel parLapply
#' @importFrom parallel stopCluster
#' @importFrom doRNG %dorng% registerDoRNG
#' @importFrom foreach %do% foreach
#' @importFrom foreach %dopar% foreach
#' @importFrom doParallel registerDoParallel stopImplicitCluster
#' @importFrom splines interpSpline
#' @importFrom npregfast frfast
#' @importFrom stats lm predict
#' @export
regclustcurves <- function(y, x, z,
kvector = NULL, kbin = 50, h = -1,
nboot = 100, algorithm = 'kmeans', alpha = 0.05,
cluster = FALSE, ncores = NULL, seed = NULL,
multiple = FALSE, multiple.method = 'holm'){
method <- "regression"
# Defining error codes
error.code.0 <- "Argument seed must be an object of type numeric."
#error.code.1 <- "Argument method must be a string with 'survival' or 'regression'."
error.code.2 <- "Argument algorithm must be a string with 'kmeans' or 'kmedians'."
error.code.3 <- "Argument multiple must be an object of type logical."
error.code.4 <- "Argument multiple.method must be an object of type string."
error.code.5 <- "Argument multiple.method must be some of the correction methods: 'bonferroni', 'holm', 'hochberg', etc."
error.code.6 <- "Argument kvector must be an object of type numeric."
#error.code.7 <- "Argument weights is missing and it is required when method is 'survival'."
#error.code.8 <- "Argument weights must be a vector of binary numbers."
error.code.9 <- "Argument x is missing and it is required."
#error.code.10 <- "Argument y is missing and it is required when method is 'survival'."
error.code.11 <- "Argument y is missing and it is required."
# # Checking method as strings and type
# if(missing(method)){
# stop(error.code.1)
# }else if (!is.character(method) ) {
# stop(error.code.1)
# }else if (nchar(method)!= 8 & nchar(method)!= 10) {
# stop(error.code.1)
# }else if(method != 'survival' & method != 'regression') {
# stop(error.code.1)
# }
# Checking algorithm as string and type
if (!is.character(algorithm) ) {
stop(error.code.2)
}else if (nchar(algorithm)!= 6 & nchar(algorithm)!= 8) {
stop(error.code.2)
}else if(algorithm != 'kmeans' & algorithm != 'kmedians') {
stop(error.code.2)
}
# Checking multiple as logical
if (!is.logical(multiple) ) {
stop(error.code.3)
}
# Checking multiple.method as string
if (!is.character(multiple.method) ) {
stop(error.code.4)
}else if(nchar(multiple.method) == 0){
stop(error.code.5)
}
# Checking seed as numeric
if (!is.null(seed)) {
if(!is.numeric(seed)){
stop(error.code.0)
}
set.seed(seed)
}
# Checking kvector
if(is.null(kvector)) {
kvector <- c(1:(length(unique(z))-1))
}else{
if(!is.numeric(kvector)){
stop(error.code.6)
}
}
#---------------
if (isTRUE(cluster)) {
if (is.null(ncores)) {
num_cores <- parallel::detectCores() - 1
}else{
num_cores <- ncores
}
registerDoParallel(cores = num_cores)
if (!is.null(seed)) registerDoRNG(seed)
on.exit(stopImplicitCluster())
}
#------------------
time <- y
#status <- weights
fac <- z
accept <- 0
ii <- 1
pval <- NA
tval <- NA
h0tested <- NA
aux <- list()
for (k in kvector){
if(k == 1){
cat(paste("Checking",k, "cluster...", "\n"), sep = "")
}else{
cat(paste("Checking",k, "clusters...", "\n"), sep = "")
}
# if(method == 'survival'){
# # if(missing(weights)) {
# # stop(error.code.7)
# # }else if(length(unique(weights)) > 2){
# # stop(error.code.8)
# # }else if(sum(unique(weights))>1 | sum(unique(weights))<0){
# # stop(error.code.8)
# # }
# #
# # if(missing(y)) {
# # stop(error.code.10)
# # }
#
# aux[[ii]] <- testing_k(time = time, status = status, fac = fac, k = k,
# kbin = kbin, nboot = nboot, algorithm = algorithm,
# seed = seed, cluster = cluster)
# data <- NULL
#
# }else
if(method == 'regression'){
if(missing(x)) {
stop(error.code.9)
}
if(missing(y)) {
stop(error.code.11)
}
aux[[ii]] <- kgroups(x = x, y = y, f = z, nboot = nboot, K = k,
h = h, ngrid = kbin, algorithm = algorithm, seed = seed,
cluster = cluster)
}
pval[ii] <- aux[[ii]]$pvalue
tval[ii] <- aux[[ii]]$t
h0tested[ii] <- k
if(isTRUE(multiple)){
pval <- p.adjust(pval, method = multiple.method)
ind <- pval >= alpha
if(sum(ind) > 0){
ind_list <- which(ind)[1]
k <- kvector[ind_list]
aux <- aux[[ind_list]]
accept <- 1
break
}
}
if(aux[[ii]]$pvalue >= alpha){
aux <- aux[[ii]]
accept <- 1
break
}
ii <- ii + 1
}
if (accept == 1) {
if(k == 1){
cat("\n")
cat(paste("Finally, there is one cluster.", "\n"), sep = "")
}else{
cat("\n")
cat(paste("Finally, there are",k, "clusters.", "\n"), sep = "")
}
# muhat under h0 and under h1
if(method == 'survival'){
h0 <- survfit(Surv(time, status) ~ aux$cluster[fac])
h1 <- survfit(Surv(time, status) ~ fac)
}else{
data <- data.frame(x = x, y = y, f = z)
#h0 <- aux$centers
data0 <- data
data0$f <- aux$levels[aux$cluster[data$f]]
h0 <- by(data0, data0$f, muhatrfast2, h = h, kbin = kbin)
#h1 <- aux$muhat
h1 <- by(data, data$f, muhatrfast2, h = h, kbin = kbin)
}
}else{
k <- paste( ">", k, sep ="")
aux$levels <- NA
aux$cluster <- NA
h0 <- NA
if(method == 'survival'){
h1 <- survfit(Surv(time, status) ~ fac)
}else{
#h1 <- aux$muhat
data <- data.frame(x = x, y = y, f = z)
h1 <- by(data, data$f, muhatrfast2, h = h, kbin = kbin)
}
cat("\n")
cat(paste("The number 'k' of clusters has not been found, try another kvector.", "\n"), sep = "")
}
res <- list(num_groups = k, table = data.frame(H0 = h0tested, Tvalue = tval, pvalue = pval),
levels = aux$levels, cluster = as.numeric(aux$cluster),
centers = h0, curves = h1, method = method, data = data, algorithm = algorithm,
call = match.call())
class(res) <- "clustcurves"
return(res)
} #end clustcurv
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.