Nothing
#' Co-occurrence distance for binary/ categorical variables data
#'
#' @description This function calculates the co-occurrence distance proposed
#' by Ahmad and Dey (2007).
#'
#' @param data A matrix or data frame of binary/ categorical variables
#' (see \strong{Details}).
#'
#' @details This function computes co-occurrence distance, which is a binary/
#' categorical distance, that based on the other variable's distribution
#' (see \strong{Examples}). In the \strong{Examples}, we have a data set:
#'
#' \tabular{lrrr}{
#' object \tab x \tab y \tab z \cr
#' 1 \tab 1 \tab 2 \tab 2 \cr
#' 2 \tab 1 \tab 2 \tab 1 \cr
#' 3 \tab 2 \tab 1 \tab 2 \cr
#' 4 \tab 2 \tab 1 \tab 2 \cr
#' 5 \tab 1 \tab 1 \tab 1 \cr
#' 6 \tab 2 \tab 2 \tab 2 \cr
#' 7 \tab 2 \tab 1 \tab 2
#' }
#'
#' The co-occurrence distance transforms each category of binary/ categorical
#' in a variable based on the distribution of other variables, for example,
#' the distance between categories 1 and 2 in the x variable can be
#' different to the distance between categories 1 and 2 in the z
#' variable. As an example, the transformed distance between categories 1 and 2
#' in the z variable is presented.
#'
#' A cross tabulation between the z and x variables with
#' corresponding (column) proportion is
#'
#' \tabular{rrrrrr}{
#' \tab 1 \tab 2 \tab ||\tab 1 \tab 2 \cr
#' 1 \tab 2 \tab 1 \tab ||\tab 1.0 \tab 0.2 \cr
#' 2 \tab 0 \tab 4 \tab ||\tab 0.0 \tab 0.8
#' }
#'
#' A cross tabulation between the z and y variables with
#' corresponding (column) proportion is
#'
#' \tabular{rrrrrr}{
#' \tab 1 \tab 2 \tab ||\tab 1 \tab 2 \cr
#' 1 \tab 1 \tab 3 \tab ||\tab 0.5 \tab 0.6 \cr
#' 2 \tab 1 \tab 2 \tab ||\tab 0.5 \tab 0.4
#' }
#'
#' Then, the maximum values of the proportion in each row are taken such that
#' they are 1.0, 0.8, 0.6, and 0.5. The new distance between categories 1 and
#' 2 in the z variable is
#' \deqn{\delta_{1,2}^z = \frac{(1.0+0.8+0.6+0.5) - 2}{2} = 0.45}
#' The constant \eqn{2} in the formula applies because the z variable
#' depends on the 2 other variable distributions, i.e the x and y
#' variables. The new distances of each category in the
#' for the x and y variables can be calculated in a similar way.
#'
#' Thus, the distance between objects 1 and 2 is 0.45. It is only the z
#' variable counted to calculate the distance between objects 1 and 2
#' because objects 1 and 2 have similar values in both the x and y
#' variables.
#'
#' The \code{data} argument can be supplied with either a matrix or data frame,
#' in which the class of the element has to be an integer. If it is not
#' an integer, it will be converted to an integer class. If the \code{data}
#' of a variable only, a simple matching is calculated. The co-occurrence
#' is absent due to its dependency to the distribution of other variables
#' and a \code{warning} message appears.
#'
#' @return Function returns a distance matrix (n x n).
#'
#' @author Weksi Budiaji \cr Contact: \email{budiaji@untirta.ac.id}
#'
#' @references Ahmad, A., and Dey, L. 2007. A K-mean clustering algorithm for
#' mixed numeric and categorical data. Data and Knowledge Engineering 63,
#' pp. 503-527.
#' @references Harikumar, S., PV, S., 2015. K-medoid clustering for heterogeneous data sets.
#' JProcedia Computer Science 70, 226-237.
#'
#' @examples
#' set.seed(1)
#' a <- matrix(sample(1:2, 7*3, replace = TRUE), 7, 3)
#' cooccur(a)
#'
#'
#' @export
cooccur <- function(data) {
if((is.matrix(data)||is.data.frame(data))==FALSE)
stop("The data must be a matrix or a data frame!")
rn <- rownames(data)
if(is.logical(data)==TRUE) data <- data.matrix(as.data.frame(data)) + as.integer(1)
if(is.numeric(data)==TRUE) data <- apply(data, 2, function(x) as.integer(x))
if(is.data.frame(data)==TRUE) data <- apply(data, 2, function(x) as.integer(as.factor(x)))
col <- ncol(data)
if (col != 1) {
newdist <- function(data, col = col, colnum){
nvar <- 1:col
n <- length(levels(as.factor(data[,colnum])))
var <- prob <- vector("list", (col-1))
for (i in 1:(col-1)) {
var[[i]] <- table(data[,nvar[-colnum][i]],data[,colnum])
prob[[i]] <- var[[i]]/matrix(colSums(var[[i]]),
nrow=nrow(var[[i]]), ncol = ncol(var[[i]]),
byrow = TRUE)
}
probmat <- do.call(rbind,prob)
matnew <- matrix(0, nrow = n, ncol = n)
rownames(matnew) <- colnames(matnew) <- 1:n
for (i in 1:n) {
for (j in 1:n) {
matnew[i,j] <- (sum(apply(probmat[,c(i,j)], 1, max))-(col-1))/(col-1)
}
}
return(matnew)
}
newdata <- vector("list", col)
for (i in 1:col) {
newdata[[i]] <- newdist(data, col=col, i)
}
distmat <- matrix(0, nrow(data), nrow(data))
for (i in 1:nrow(data)) {
for (j in 1:nrow(data)){
distsum <- numeric(col)
for (k in 1:col) {
distsum[k] <- newdata[[k]][data[i, k], data[j, k]]
}
distmat[i, j] <- sum(distsum)
}
}
rownames(distmat) <- colnames(distmat) <- rn
} else {
distmat <- kmed::matching(data, data)
rownames(distmat) <- colnames(distmat) <- rn
warning('Due to only 1 variable, simple matching distance is calculated instead!
To produce coocurrence distance, it requires at least 2 variables.')
}
return(distmat)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.