# R/default.mass.R In shallot: Random Partition Distribution Indexed by Pairwise Information

#' #' Association Matrix
#' #'
#' #' This function creates an association matrix for a clustering/partition.
#' #' The (i,j) element of the matrix is 1 if item i and j are in the same
#' #' cluster/subset and 0 otherwise.
#' #'
#' #' @param cl A vector containing cluster labels for a clustering/partition.
#' #' @return A matrix of 0s and 1s indicating whether items i and j are in
#' #' the same cluster/subset.
#' #' @examples
#' #'
#' #' cl <- rep(1:3,times=c(2,4,3))
#' #' association.matrix(cl)
#' #'
#' #' @export association.matrix
#' association.matrix <- function(cl){
#'   n <- length(cl)
#'   cl1 <- rep(cl,n)
#'   cl2 <- rep(cl,each=n)
#'   matrix(cl1==cl2, ncol=n)*1
#' }
#'
#'
#' #' Variance Ratio
#' #'
#' #' This function calculates the variance of the expected pairwise allocation
#' #' matrix (EPAM) within clusters/subsets over the total variance of the expected
#' #' pairwise allocation matrix.
#' #'
#' #' The \code{\link{variance.ratio}} function takes as input an object of class
#' #' \code{salso.confidence} and calculates the variance ratio for the estimated
#' #' partition from the corresponding expected pairwise allocation matrix (EPAM).
#' #'
#' #' The variance ratio is the weighted average of the within cluster variances of
#' #' the EPAM, weighted by the number of pairwise EPAM values per cluster, over
#' #' the total variance of the EPAM.
#' #'
#' #' @param x,y If \code{y} is not specified then \code{x} must be an object of
#' #'   class \code{salso.confidence}. Otherwise, \code{x} is a vector of cluster
#' #'   labels and \code{y} is an expected pairwise allocation matrix.
#' #' @return A vector of variance ratios.
#' #' @examples
#' #' x <- rep(c(1,2,3), times=c(2,3,5))
#' #' y <- diag(10)
#' #' y[upper.tri(y)] <- runif(45)
#' #' variance.ratio(x,y)
#' #'
#' #' @family Default Mass Selection
#' #' @export
#' variance.ratio <- function(x,y) {
#'   if (inherits(x,"salso.confidence")) {
#'     cl.am <- association.matrix(x$estimate) #' y <- x$psm
#'     x <- x$estimate #' } else { #' cl.am <- association.matrix(x) #' } #' cl.vars <- sapply(unique(x), function(i) { #' if (sum(x == i) < 3) return(c(0,1)) #' cl.am[x != i, x != i] <- 0 #' z <- y[upper.tri(y) & cl.am] #' c(sum((mean(z) - z)^2)/(length(z)-1),length(z)) #' }) #' #' cl.var <- sum(apply(cl.vars,2,prod)/sum(cl.vars[2,])) #' z <- y[upper.tri(y)] #' tot.var <- sum((mean(z) - z)^2)/(length(z) - 1) #' #' cl.var/tot.var #' } #' #' #' Partition Confidence #' #' #' #' This function calculates the partition confidence of a partition estimate #' #' from the corresponding expected pairwise allocation matrix (EPAM). #' #' #' #' The \code{\link{partition.confidence}} takes as input an object of class #' #' \code{salso.confidence} and then calculates the partition confidence from the #' #' expected pairwise allocation matrix. #' #' #' #' The partition confidence is the average values of the EPAM for items that are #' #' clustered together. Items which are in their own subset do not contribute to #' #' partition confidence. #' #' #' #' @inheritParams variance.ratio #' #' @return A vector of partition confidences. #' #' #' #' @examples #' #' x <- rep(c(1,2,3), times=c(2,3,5)) #' #' y <- diag(10) #' #' y[upper.tri(y)] <- runif(45) #' #' partition.confidence(x,y) #' #' #' #' @family Default Mass Selection #' #' @export #' partition.confidence <- function(x,y){ #' if (inherits(x,"salso.confidence")) { #' if (length(unique(x$estimate)) == length(x$estimate)) return(0) #' cl.am <- association.matrix(x$estimate)
#'     y <- x$psm #' } else { #' cl.am <- association.matrix(x) #' } #' mean(y[upper.tri(y) & cl.am]) #' } #' #' #' #' Mass Selection Algorithm #' #' #' #' This function selects the optimal mass value for Cluster Analysis via Random #' #' Partition distributions using the Ewens-Pitman attraction distribution. #' #' #' #' The \code{\link{mass.algorithm}} function is used internally in the #' #' \code{\link{default.mass}} function. #' #' #' #' The default value for \code{w} is \code{c(1,1,1)}. #' #' #' #' The general algorithm is as follows: \enumerate{ \item Rank the partition #' #' confidence (\code{pc}) and variance ratio (\code{vr}). Select the #' #' \code{mass_i} value which minimizes the weigthed sum of \eqn{w_1 pc_i + w_2 #' #' vr_i + w_3 n_i}. } #' #' #' #' The two stage algorithm proceeds as follows: \enumerate{ \item Rank the #' #' partition confidence (\code{pc}) and variance ratio (\code{vr}). For each #' #' number of clusters \code{n} select the index which minimizes the weigthed sum #' #' of \eqn{w_1 pc_i + w_2 vr_i}. \item Rerank the \code{pc} and \code{vr} of the #' #' selected indices and select the \code{mass_i} value which minimizes the #' #' weigthed sum of \eqn{w_1 pc_i + w_2 vr_i + w_3 n_i} from among the selected #' #' indices. } #' #' #' #' #' #' @param mass a vector of mass values #' #' @param pc a vector of partition confidences for the partition estimates at #' #' the corresponding mass values #' #' @param vr a vector of variance ratios for the partition estimates at the #' #' corresponding mass values #' #' @param n a vector of the number of subsets in the partition estimates at the #' #' correpsonding mass values #' #' @param w a vector of length 3 specifying the weights of \code{pc}, \code{vr}, #' #' and \code{n} #' #' @param two.stage logical; if \code{TRUE}, the two stage algorithm is #' #' implemented #' #' #' #' @return A matrix containing the best' \code{mass} value and corresponding #' #' values for \code{pc}, \code{vr}, and \code{n}. The matrix also contains the #' #' mass values for the partitions estimate with more one more and one less #' #' subset that the selected mass value. #' #' #' #' @family Default Mass Selection #' #' @importFrom graphics abline lines plot #' #' @export #' #' #' mass.algorithm <- function(mass,pc,vr,n,w=c(1,1,1),two.stage=TRUE) { #' if (length(w) != 3) stop("Weights must be assigned to: PC VR N") #' #' compute.rankings <- function(subset=1:length(pc)) { #' rankings <- rank(1-pc[subset])*w[1] + rank(vr[subset])*w[2] + n[subset]*w[3] #' if (length(rankings) > 3 && FALSE) { #' best <- which.min(sapply(1:(length(rankings)-2), function(i) sum(rankings[i:(i+2)]))) + 1 #' } else { #' best <- which.min(rankings) #' } #' #' c(n=n[subset][best], mass = mass[subset][best], pc=pc[subset][best], vr=vr[subset][best]) #' } #' #' if (two.stage) { #' all <- sapply(unique(n[n>1]), function(i) compute.rankings(n == i)) #' rankings <- rank(1-all["pc",])*w[1] + rank(all["vr",])*w[2] + all["n",]*w[3] #' index <- which.min(rankings) #' if (index >= ncol(all)) { #' out <- t(all[,c(best=index, index-1)]) #' } else { #' out <- t(all[,c(best=index, index-1, index+1)]) #' } #' } else { #' best <- compute.rankings(n>1) #' index <- which(unique(n[n>1]) == best["n"]) #' others <- sapply(unique(n[n>1])[index + c(-1,1)], function(i) compute.rankings(n == i)) #' #' out <- rbind(best,t(others)) #' } #' #' plot(mass,pc,type="l",ylim=c(0,1),col="dodgerblue", ylab="") #' lines(vr[n>1]~mass[n>1],col="forestgreen") #' abline(v=out[1,"mass"]) #' for (i in 2:nrow(out)) { #' abline(v=out[i,"mass"], lty=2) #' } #' #' out #' } #' #' #' Default Mass Selection #' #' #' #' This function selects an optimal mass value for Cluster Analysis via Random #' #' Partition Distribtuions, using the Ewens-Pitman Attraction distribution. #' #' #' #' The function draws \code{n.draws} partitions at each specified mass value. If #' #' a vector of mass values is not given, then the default of #' #' \code{seq(0.1,10,0.2)} is used for loss #' #' \code{"VI.lb"} and \code{seq(0.1,5,0.05)} used for #' #' the other loss functions. #' #' #' #' If a list of expected pairwise allocation matrices (EPAM) is provided, #' #' additional draws at matching mass values are added to the corresponding #' #' matrix. Additionally, no new draws are needed for estimation, if a list of #' #' EPAMs is provided. #' #' #' #' A partition/clustering estimate from each EPAM is obtained using the SALSO #' #' method in \code{\link[salso]{salso}}. The estimate given minimizes the #' #' specified \code{loss} function with respect to the EPAM. #' #' #' #' The function then uses the \code{\link{mass.algorithm}} to select the optimal #' #' mass value for clustering estimation. #' #' #' #' #' #' @param mass optional, a vector of mass values. #' #' @param list.epam optional, a list of expected pairwise allocation matrices. #' #' Each matrix in the list needs the attributes "\code{mass}" and #' #' "\code{n.draws}". #' #' @param dis a dissimilarity structure of class \code{dist}. #' #' @param new.draws logical; if \code{TRUE} then new draws are obtained at each #' #' mass value. #' #' @param w a vector of length 3 of the weights to be used in the #' #' \code{\link{mass.algorithm}}. #' #' @param discount parameter of the Ewens-Pitman Attraction distribution. #' #' @param temp temperature parameter of the Ewens-Pitman Attraction #' #' distribution. #' #' @param loss One of \code{"binder"} or \code{"VI.lb"} to indicate #' #' the optimization should seek to minimize the expectation of the #' #' Binder loss (Binder 1978) or the lower bound of the expectation of the variation of #' #' information loss (Wade & Ghahramani 2017), respectively. #' #' @param n.draws number of draws of partitions to be obtained at each mass #' #' value. #' #' @param two.stage logical; if \code{TRUE}, the two stage algorithm is #' #' implemented in \code{\link{mass.algorithm}}. #' #' @param parallel logical; if \code{TRUE} computations will take advantage #' #' multiple CPU cores. #' #' @param x An object from the \code{\link{default.mass}} function. #' #' @param ... currently ignored #' #' #' #' @return An object of class \code{shallot.default.mass}. This object is a list #' #' containing a matrix of best' possible mass values to maximize partition #' #' confidence and minimize the variance ratio, the clustering estimate, the #' #' expected pairwise allocation matrix, parameters used for optimization and #' #' the EPA distribution, and the list of expected pairwise allocation matrices #' #' for each mass value. #' #' #' #' @family Default Mass Selection #' #' @importFrom salso salso #' #' @export #' #' #' default.mass <- function(mass, list.epam, dis, new.draws=TRUE, w=c(1,1,1), discount=0, temp=10, loss="binder", n.draws=100L, two.stage=TRUE, parallel=TRUE) { #' if (missing(mass)) { #' if (loss == "VI.lb" ) { #' mass <- seq(0.1,10,0.2) #' } else { #' mass <- seq(0.1,5,by=0.05) #' } #' } #' #' single.epam <- function(mass,discount,dis,n.draws,temp=10,parallel=parallel) { #' n <- nrow(as.matrix(dis)) #' epa <- ewens.pitman.attraction(mass(mass), discount(discount), attraction(permutation(n.items=n, fixed=FALSE), decay.exponential(temperature(temp), dis))) #' draws <- sample.partitions(epa, n.draws, parallel = parallel) #' as.matrix(pairwise.probabilities(draws)) #' } #' #' if(missing(list.epam) && new.draws == FALSE) stop("must provide list.epam or obtain new draws") #' if (missing(list.epam) || new.draws == TRUE) { #' out <- lapply(mass, single.epam, n.draws=n.draws, discount=discount, dis=dis, temp=temp, parallel=parallel) #' sapply(1:length(out), function(i) { #' attr(out[[i]],"mass") <<- mass[i] #' attr(out[[i]],"n.draws") <<- n.draws #' attr(out[[i]],"discount") <<- discount #' attr(out[[i]],"temperature") <<- temp #' }) #' if (!missing(list.epam)) { #' mass0 <- sapply(list.epam,attr,"mass") #' combine <- which(mass0 %in% mass) #' add <- which(!(mass %in% mass0)) #' if (length(combine) > 0) { #' sapply(combine, function(i) { #' n.draws0 <- attr(list.epam[[i]],"n.draws") #' which.out <- which(mass0[i] == mass) #' list.epam[[i]] <<- (list.epam[[i]]*n.draws0 + out[[which.out]]*n.draws)/(n.draws + n.draws0) #' attr(list.epam[[i]],"n.draws") <<- n.draws0 + n.draws #' }) #' } #' if (length(add) > 0) { #' list.epam <- c(list.epam, out[add]) #' } #' list.epam <- list.epam[order(sapply(list.epam,attr,"mass"))] #' } else { #' list.epam <- out #' } #' } #' #' out.cl <- lapply(list.epam, function(x) { #' attr(x, "mass") <- NULL #' attr(x, "discount") <- NULL #' attr(x, "n.draws") <- NULL #' attr(x, "temperature") <- NULL #' salso(x, loss=loss, parallel=parallel)$estimate
#'   })
#'
#'   mass <- sapply(list.epam, attr, "mass")
#'   weight <- sapply(list.epam, attr, "n.draws")
#'   pc <- mapply(partition.confidence, out.cl, list.epam)
#'   vr <- mapply(variance.ratio, out.cl, list.epam)
#'   ncl <- sapply(out.cl,function(X) length(unique(X)))
#'
#'   best.mass <- mass.algorithm(mass,pc,vr,ncl,w,two.stage)
#'   best.index <- which(mass == best.mass[1,"mass"])
#'
#'   output <- list(criteria=best.mass,
#'                  clustering = out.cl[[best.index]],
#'                  expectedPairwiseAllocationMatrix = list.epam[[best.index]],
#'                  params = list(discount = discount, temperature = temp, loss = loss),
#'                  algorithm = data.frame(mass=mass, pc=pc, vr=vr, nClusters=ncl, nDraws=weight),
#'                  list.epam = list.epam)
#'   class(output) <- "shallot.default.mass"
#'   output
#' }
#'
#' #' @rdname default.mass
#' #' @export
#' #'
#' print.shallot.default.mass <- function(x, ...) {
#'   cat("Clustering estimated using loss: ",x$params$loss,", with discount ",
#'       x$params$discount," and temperature ", x$params$temperature,"\n\n",sep="")
#'   cat("The selected mass value is", x$criteria[1,"mass"],"\n") #' cat("number of clusters: ",x$criteria[1,"n"],"\npartition confidence: ",x$criteria[1,"pc"],"\nvariance ratio: ", x$criteria[1,"vr"], "\n", sep="")
#'
#'   cat("\nestimated clustering:\n", x\$clustering)
#'   cat("\n")
#' }
#'
#'
#'
#'
#'


## Try the shallot package in your browser

Any scripts or data that you put into this service are public.

shallot documentation built on Nov. 10, 2020, 5:09 p.m.