# R/GFCM.R In geocmeans: Implementing Methods for Spatial Fuzzy Unsupervised Classification

#### Documented in calcFGCMBelongMatrixcalcSFGCMBelongMatrixGCMeansSGFCMeans

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#### GFCM ####
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#' @title Calculate the generalized membership matrix
#'
#' @description Calculate the generalized membership matrix according to a set of
#' centroids, the observed data, the fuzziness degree, and a beta parameter
#'
#' @param centers A matrix or a dataframe representing the centers of the
#'   clusters with p columns and k rows
#' @param data A dataframe or matrix representing the observed data with n rows
#'   and p columns
#' @param m A float representing the fuzziness degree
#' @param beta A float for the beta parameter (control speed convergence and classification crispness)
#' @return A n * k matrix representing the belonging probabilities of each
#'   observation to each cluster
#' @keywords internal
#' @examples
#' #This is an internal function, no example provided
calcFGCMBelongMatrix <- function(centers, data, m, beta ){
#calculating euclidean distance between each observation and each center
centerdistances <- apply(centers, 1, function(x) {
return(calcEuclideanDistance(data, x))
})

# calculating aj
aj <- beta * apply(centerdistances,1,min)

K <- nrow(centers)
uij <- sapply(1:K, function(i){
dij <- centerdistances[,i]
denomVals <- sapply(1:K,function(k){
dkj <- centerdistances[,k]
return(((dij-aj) / (dkj-aj))**(1/(m-1)))
})
denom <- rowSums(denomVals)
return(1/denom)
})
uij <- ifelse(is.na(uij),1,uij)
return(uij)
}

#' @title Generalized C-means
#'
#' @description The generalized c-mean algorithm
#'
#' @param data A dataframe with only numerical variable
#' @param k An integer describing the number of cluster to find
#' @param m A float for the fuzziness degree
#' @param beta A float for the beta parameter (control speed convergence and classification crispness)
#' @param maxiter A float for the maximum number of iteration
#' @param tol The tolerance criterion used in the evaluateMatrices function for
#'   convergence assessment
#' @param standardize A boolean to specify if the variables must be centered and
#'   reduced (default = True)
#' @param verbose A boolean to specify if the messages should be displayed
#' @param init A string indicating how the initial centers must be selected. "random"
#' indicates that random observations are used as centers. "kpp" use a distance based method
#' resulting in more dispersed centers at the beginning. Both of them are heuristic.
#' @param seed An integer used for random number generation. It ensures that the
#' start centers will be the same if the same integer is selected.
#' @return A named list with :
#'  \itemize{
#'         \item Centers: a dataframe describing the final centers of the groups
#'         \item Belongings: the final membership matrix
#'         \item Groups: a vector with the names of the most likely group for each observation
#'         \item Data: the dataset used to perform the clustering (might be standardized)
#' }
#' @export
#' @examples
#' data(LyonIris)
#' AnalysisFields <-c("Lden","NO2","PM25","VegHautPrt","Pct0_14","Pct_65","Pct_Img",
#' "TxChom1564","Pct_brevet","NivVieMed")
#' dataset <- LyonIris@data[AnalysisFields]
#' result <- GCMeans(dataset,k = 5, m = 1.5, beta = 0.5, standardize = TRUE)
GCMeans <- function(data, k, m, beta, maxiter = 500, tol = 0.01, standardize = TRUE, verbose = TRUE, init = "random", seed = NULL) {
# standardize data if required
if (standardize) {
if (verbose){
print("Standardizing the data (set parameter to FALSE to avoid this step)")
}
for (i in 1:ncol(data)) {
data[, i] <- scale(data[, i])
}
}

data <- as.matrix(data)
results <- main_worker("GFCM", data = data,
k = k, m = m, beta = beta,
maxiter = maxiter, tol = tol,
standardize = standardize, verbose = verbose,
seed = seed, init = init)
return(results)
}

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#### SGFCM ####
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#' @title Calculate the generalized membership matrix (spatial version)
#'
#' @description Calculate the generalized membership matrix (spatial version) according to a set of
#' centroids, the observed data, the fuzziness degree a neighbouring matrix,
#' a spatial weighting term and a beta parameter
#'
#' @param centers A matrix or a dataframe representing the centers of the
#'   clusters with p columns and k rows
#' @param data A dataframe or matrix representing the observed data with n rows
#'   and p columns
#' @param wdata A dataframe or matrix representing the lagged observed data with
#'   nrows and p columns
#' @param m A float representing the fuzziness degree
#' @param alpha A float representing the weight of the space in the analysis (0
#'   is a typical fuzzy-c-mean algorithm, 1 is balanced between the two
#'   dimensions, 2 is twice the weight for space)
#' @param beta A float for the beta parameter (control speed convergence and classification crispness)
#' @return A n * k matrix representing the belonging probabilities of each
#'   observation to each cluster
#' @keywords internal
#' @examples
#' #This is an internal function, no example provided
calcSFGCMBelongMatrix <- function(centers, data, wdata, m, alpha, beta ){
#calculating euclidean distance between each observation and each center
centerdistances <- apply(centers, 1, function(x) {
return(calcEuclideanDistance(data, x))
})

#same for the lagged data
Wcenterdistances <- apply(centers, 1, function(x) {
return(calcEuclideanDistance(wdata, x))
})

# calculating aj
aj <- beta * apply(centerdistances,1,min)

K <- nrow(centers)
uij <- sapply(1:K, function(i){
dij <- centerdistances[,i]
Wdij <- Wcenterdistances[,i]
denomVals <- sapply(1:K,function(k){
dkj <- centerdistances[,k]
Wdkj <- Wcenterdistances[,k]
num <- (dij-aj) + alpha * (Wdij)
denum <- (dkj-aj) + alpha * (Wdkj)
return(( num / denum)**(1/(m-1)))
})
denom <- rowSums(denomVals)
return(1/denom)
})
uij <- ifelse(is.na(uij),1,uij)
return(uij)
}

#' @title SGFCMeans
#'
#' @description spatial version of the generalized c-mean algorithm (SGFCMeans)
#'
#' @details The implementation is based on the following article : \doi{10.1016/j.dsp.2012.09.016}.\cr
#'
#' the matrix of belonging (u) is calculated as follow \cr
#' \deqn{u_{ik} = \frac{(||x_{k} - v{_i}||^2 -b_k + \alpha||\bar{x_{k}} - v{_i}||^2)^{(-1/(m-1))}}{\sum_{j=1}^c(||x_{k} - v{_j}||^2 -b_k + \alpha||\bar{x_{k}} - v{_j}||^2)^{(-1/(m-1))}}}
#'
#' the centers of the groups are updated with the following formula
#' \deqn{v_{i} = \frac{\sum_{k=1}^N u_{ik}^m(x_{k} + \alpha\bar{x_{k}})}{(1 + \alpha)\sum_{k=1}^N u_{ik}^m}}
#'
#' with
#' \itemize{
#' \item vi the center of the group vi
#' \item xk the data point k
#' \item xk_bar the spatially lagged data point k
#' \deqn{b_k = \beta \times min(||x_{k} - v||)}
#' }
#'
#' @param data A dataframe with only numerical variable
#' @param nblistw A list.w object describing the neighbours typically produced
#'   by the spdep package
#' @param k An integer describing the number of cluster to find
#' @param m A float for the fuzziness degree
#' @param alpha A float representing the weight of the space in the analysis (0
#'   is a typical fuzzy-c-mean algorithm, 1 is balanced between the two
#'   dimensions, 2 is twice the weight for space)
#' @param beta A float for the beta parameter (control speed convergence and classification crispness)
#' @param lag_method A string indicating if a classical lag must be used
#' ("mean") or if a weighted median must be used ("median")
#' @param maxiter An integer for the maximum number of iteration
#' @param tol The tolerance criterion used in the evaluateMatrices function for
#'   convergence assessment
#' @param standardize A boolean to specify if the variable must be centered and
#'   reduced (default = True)
#' @param verbose A boolean to specify if the progress bar should be displayed
#' @param init A string indicating how the initial centers must be selected. "random"
#' indicates that random observations are used as centers. "kpp" use a distance based method
#' resulting in more dispersed centers at the beginning. Both of them are heuristic.
#' @param seed An integer used for random number generation. It ensures that the
#' start centers will be the same if the same integer is selected.
#' @return A named list with
#' \itemize{
#'         \item Centers: a dataframe describing the final centers of the groups
#'         \item Belongings: the final membership matrix
#'         \item Groups: a vector with the names of the most likely group for each observation
#'         \item Data: the dataset used to perform the clustering (might be standardized)
#' }
#' @export
#' @examples
#' data(LyonIris)
#' AnalysisFields <-c("Lden","NO2","PM25","VegHautPrt","Pct0_14","Pct_65","Pct_Img",
#' "TxChom1564","Pct_brevet","NivVieMed")
#' dataset <- LyonIris@data[AnalysisFields]
#' queen <- spdep::poly2nb(LyonIris,queen=TRUE)
#' Wqueen <- spdep::nb2listw(queen,style="W")
#' result <- SGFCMeans(dataset, Wqueen,k = 5, m = 1.5, alpha = 1.5, beta = 0.5, standardize = TRUE)
SGFCMeans <- function(data, nblistw, k, m, alpha, beta, lag_method="mean", maxiter = 500, tol = 0.01, standardize = TRUE, verbose = TRUE, init = "random", seed = NULL) {

if(class(nblistw)[] != "listw"){
stop("the nblistw must be a listw object from spdep package")
}

if(lag_method %in% c("mean","median") == FALSE){
stop("the parameter lag_method must be one of 'mean' or 'median'")
}

# standardize data if required
if (standardize) {
if (verbose){
print("Standardizing the data (set parameter to FALSE to avoid this step)")
}
for (i in 1:ncol(data)) {
data[, i] <- scale(data[, i])
}
}

# calculating the lagged dataset
wdata <- calcLaggedData(data,nblistw,lag_method)

data <- as.matrix(data)
wdata <- as.matrix(wdata)

results <- main_worker('SGFCM', data = data, wdata = wdata,
nblistw = nblistw, k = k, m = m, alpha = alpha,
beta = beta, lag_method=lag_method, maxiter = maxiter,
tol = tol, standardize = standardize, verbose = verbose,
seed = seed, init = init)
return(results)
}


## Try the geocmeans package in your browser

Any scripts or data that you put into this service are public.

geocmeans documentation built on April 21, 2021, 9:07 a.m.