R/greenclust.R

Defines functions .print.step.v1 .combine.rows .greenclust.v1 .print.step .calculate.row.chi greenclust

Documented in greenclust

###############################################################################
#                                                                             #
#  greenclust                                                                 #
#                                                                             #
#  Part of the greenclust R package                                           #
#                                                                             #
#  Jeff Jetton                                                                #
#  March 2019 - Initial coding                                                #
#  September 2019 - version 1.1: Improved clustering implementation           #
#                                                                             #
###############################################################################



#' Row Clustering Using Greenacre's Method
#'
#' Iteratively collapses the rows of a table (typically a contingency table)
#' by selecting the pair of rows each time whose combination creates the
#' smalled loss of chi-squared.
#'
#' @param x a numeric matrix or data frame
#' @param correct a logical indicating whether to apply a continuity
#'   correction if and when the clustered table reaches a 2x2 dimension.
#' @param verbose if TRUE, prints the clustered table along with r-squared and
#'   p-value at each step
#' @return An object of class \code{greenclust} which is compatible with most
#'   \code{\link{hclust}} object functions, such as \code{\link{plot}()} and
#'   \code{\link{rect.hclust}()}. The height vector represents the proportion
#'   of chi-squared, relative to the original table, seen at each clustering
#'   step. The greenclust object also includes a vector for the chi-squared
#'   test p-value at each step and a boolean vector indicating whether the
#'   step had a tie for "winner".
#' @references Greenacre, M.J. (1988) "Clustering the Rows and Columns of
#'   a Contingency Table," \emph{Journal of Classification 5}, 39-51.
#'   \url{https://doi.org/10.1007/BF01901670}
#' @seealso \code{\link{greencut}}, \code{\link{greenplot}},
#'     \code{\link{assign.cluster}}
#' @examples
#' # Combine Titanic passenger attributes into a single category
#' tab <- t(as.data.frame(apply(Titanic, 4:1, FUN=sum)))
#' # Remove rows with all zeros
#' tab <- tab[apply(tab, 1, sum) > 0, ]
#'
#' # Perform clustering on contingency table
#' grc <- greenclust(tab)
#'
#' # Plot r-squared and p-values for each potential cut point
#' greenplot(grc)
#'
#' # Get clusters at suggested cut point
#' clusters <- greencut(grc)
#'
#' # Plot dendrogram with clusters marked
#' plot(grc)
#' rect.hclust(grc, max(clusters))
#'
#' @export
#' @importFrom stats chisq.test as.dendrogram order.dendrogram
#'             aggregate cutree pchisq
greenclust <- function(x, correct=FALSE, verbose=FALSE) {

    #######################################
    #   Validation Checks and Data Prep   #
    #######################################

    # Check for valid arguments
    if (anyNA(x) || is.null(x) || !(is.matrix(x) || is.data.frame(x)))
        stop("x must be a non-null matrix or data frame")
    # If x is a dataframe, convert to matrix
    if (is.data.frame(x)) {
        x = as.matrix(x)
    }
    if (!is.numeric(x) || sum(is.nan(x)) != 0)
        stop("x must be numeric")
    if (any(x < 0) || sum(is.infinite(x)) != 0)
        stop("all elements must be nonnegative and finite")
    if (nrow(x) < 3)
        stop("x should have at least 3 rows")
    if (ncol(x) < 2)
        stop("x should have at least 2 columns")
    if(sum(apply(x, 1, sum)==0) > 0)
        stop("all row totals must be greater than zero")
    if(sum(apply(x, 2, sum)==0) > 0)
        stop("all column totals must be greater than zero")

    # If there are no row names, give them names
    if (is.null(rownames(x))) {
        rownames(x) <- 1:nrow(x)
    }


    ################################
    #   Preliminary calculations   #
    ################################

    # Pre-calculate the column totals and overall table total
    # These will never change, so we just have to do this once
    column.totals <- apply(x, MARGIN=2, FUN=sum)
    table.total <- sum(column.totals)

    # Keep track of all currently "active" row indices. That is, the
    # rows that would still be part of the table at each iteration,
    # ignoring the rows that have already been combined into a new row,
    # and adding any newly-combined rows. This gives us a vector to cycle
    # through each time when calculating potential reductions in chi for
    # all the various active row combinations (and keeps us from having
    # to actually combine the rows into a new table each time).
    active.rows <- 1:nrow(x)

    # Store current degrees of freedom and initial row count
    df <- prod(dim(x) -  1)
    n <- nrow(x)

    # Store row chi values for every row in the table
    row.chis <- apply(x, MARGIN=1, FUN=.calculate.row.chi,
                      column.totals, table.total)

    # Sum of row chis is the overall chi-squared statistic
    # Remember this value for later r-squared ("height") calculations
    initial.chi <- sum(row.chis)
    running.chi <- initial.chi

    # Initialize the "merge matrix" (record of each merge step) and the vectors
    # that will be used to construct our "greenclust" object at the end
    merge.matrix <- vector()
    heights <- vector()
    p.values <- vector()
    tie <- vector()

    # Create a data.frame to store the chi "reduction" calculations, i.e.,
    # the amount by which the table's orver chi statistic would drop for
    # every potential two-row combination left to consider
    reductions <- data.frame(i=vector("integer"),
                             j=vector("integer"),
                             reduction=vector("numeric"))

    # Perform the initial reduction calculations for the entire table
    for (i in 1:(n-1)) {
        # Check combinations of row i with every row higher than i...
        for (j in (i+1):n) {
            # offset in 1:(n-i)
            # j = i + offset
            # If we combined these two rows, what row chi would result?
            combined.row.chi <- .calculate.row.chi(x[i, ] + x[j, ],
                                                   column.totals, table.total)
            # How much of a reduction is that, compared to the separate rows?
            separate.row.chi.total <- row.chis[i] + row.chis[j]
            chi.reduction <-  separate.row.chi.total - combined.row.chi
            # If this is negative, that's only due to floating-point error.
            # (It's not possible for a combination of rows to increase chi.)
            # Adjust to zero...
            if (chi.reduction < 0) {
                chi.reduction <- 0
            }
            # Store the results for this row combo
            reductions <- rbind(reductions,
                                data.frame(i=i, j=j, reduction=chi.reduction),
                                make.row.names=FALSE)
        }
    }


    ################################
    #   Clustering                 #
    ################################

    for (cluster.number in 1:(n-1)) {

        # Get the current-lowest chi reduction value
        lowest.reduction <- min(reductions$reduction)

        # Do we have more than one with that lowest reduction?
        tie.flag <- nrow(reductions[reductions$reduction==lowest.reduction,
                                    ]) > 1
        # Winner is the first combo with the lowest reduction
        index.of.winning.reduction <- match(lowest.reduction,
                                            reductions$reduction)

        # Combining the "winner" combo...
        # We don't literally combine the two rows in our dataset, yielding a
        # new dataset with one fewer row (like version 1.0 did), since that
        # would throw off all of the row references in our pre-calcuated data.
        # Instead we create the combination as a new row in matrix x, with a
        # new chi value in row.chis.
        # We don't delete the old rows, but instead effectively ignore them
        # by deleting them from the active.rows vector and removing any row
        # in reduction that references either one.
        # Then we add new rows to reduction based on comparing our new,
        # freshly-clustered row with all the remaining ones.

        # Combine the winning rows and add the result to the end of x
        rows.to.combine <- c(reductions[index.of.winning.reduction, "i"],
                             reductions[index.of.winning.reduction, "j"])
        new.row <- x[rows.to.combine[1], ] + x[rows.to.combine[2], ]
        x <- rbind(x, new.row)
        new.row.index <- nrow(x)

        # Remove the indices of the old rows from active.rows
        active.rows <- active.rows[!(active.rows %in% rows.to.combine)]

        # Add the chi of our new row to row.chis
        new.row.chi <- .calculate.row.chi(new.row, column.totals, table.total)
        row.chis <- c(row.chis, new.row.chi)

        # Reduce the running overall chi by the winning reduction amount
        running.chi <- running.chi - lowest.reduction

        # Reduce degrees of freedom
        df <- df - (ncol(x) - 1)

        # Remove reductions that refer to the old (now non-active) rows
        # (A bit verbose, but slightly faster than using %in%, etc.)
        reductions <- reductions[reductions$i != rows.to.combine[1] &
                                     reductions$j != rows.to.combine[1] &
                                     reductions$i != rows.to.combine[2] &
                                     reductions$j != rows.to.combine[2], ]

        # Calculate chi reductions for the new row compared to each of
        # the other, older, active rows.
        for (comp.row.index in active.rows) {
            combined.row.chi <- .calculate.row.chi(new.row +
                                                       x[comp.row.index, ],
                                                   column.totals,
                                                   table.total)
            separate.row.chi.total <- new.row.chi + row.chis[comp.row.index]
            chi.reduction <-  separate.row.chi.total - combined.row.chi
            if (chi.reduction < 0) {
                chi.reduction <- 0
            }
            reductions <- rbind(reductions,
                                data.frame(i=new.row.index,
                                           j=comp.row.index,
                                           reduction=chi.reduction),
                                make.row.names=FALSE)
        }

        # Add the new cluster row's index to active.rows
        active.rows <- c(active.rows, new.row.index)

        # Add info on winning clustering to the merge matrix
        merge.matrix <- rbind(merge.matrix, rows.to.combine)
        # Rows from original table should have a negative index.
        # Clustered rows should start at 1. Adjust accordingly...
        for (i in 1:2) {
            if (rows.to.combine[i] > n) {
                merge.matrix[cluster.number, i] <- rows.to.combine[i] - n
            }
            else {
                merge.matrix[cluster.number, i] <- -rows.to.combine[i]
            }
        }

        # Add info to the other cluster-step-tracking variables
        heights <- c(heights, (initial.chi - running.chi)/initial.chi)
        p.values <- c(p.values, pchisq(running.chi, df, lower.tail=FALSE))
        tie <- c(tie, tie.flag)

        # Print details (if we're in verbose mode)
        if (verbose && cluster.number < (n - 1)) {
            # The new clusters should be named if we're going to show them
            rownames(x)[new.row.index] <- paste("Cluster", cluster.number)
            .print.step(x[active.rows, ], cluster.number,
                        rownames(x)[rows.to.combine], tie.flag, running.chi,
                        p.values[cluster.number], initial.chi)
        }

    }  # End of cluster loop:  for (cluster.number in 1:(n-1)) {


    ################################
    #   Prepare Return Object      #
    ################################

    # Get rid of the merge matrix row names added by rbind
    row.names(merge.matrix) <- NULL

    # Final height should always be 1 (for plotting)
    heights[length(heights)] <- 1

    # Final p.value is not meaningful
    p.values <- p.values[-length(p.values)]

    # Pack everything up (except the order vector)
    grc <- structure(list(merge=merge.matrix,
                          height=heights,
                          order=vector(),
                          labels=rownames(x[1:n, ]),
                          call=match.call(),
                          dist.method="chi-squared",
                          p.values=p.values,
                          tie=tie),
                     class=c("hclust", "greenclust"))

    # Use R's built-in dendrogram object to help create the order vector
    grc$order <- order.dendrogram(as.dendrogram(grc))

    # If we want to maintain R's default use of Yate's continuity correction
    # on a 2x2 table (although I can't imagine why), recalculate the second-
    # to-last cluster's chi & p. A bit of a hack to do this at the end, but
    # it usually saves a few cycles compared to checking at every clustering
    # step to set it this way in the first place, especially considering this
    # will rarely ever be done.
    if (correct & ncol(x) == 2) {
        clusters <- cutree(grc, k=2)
        clustered.table <- aggregate(x[1:n,], by=list(clusters), FUN=sum)[,-1]
        suppressWarnings(chi <- chisq.test(clustered.table, correct=TRUE))
        grc$height[n-2] <- (1 - chi$statistic/initial.chi)
        grc$p.values[n-2] <- chi$p.value
    }

    # That's it!
    return(grc)
}



#########  "Hidden" helper functions  #########################################

# Calculate a single row's contribution to total chi.
#
#             r: numeric vector of one row's cells
# column.totals: numeric vector of the table's column totals (assumed to be
#                the same length as r... this is not checked!)
#   table.total: total sum of entire table
#
.calculate.row.chi <- function(r, column.totals, table.total) {

    # Determine expected cell values by distributing the row total
    # in proportion to the column totals
    expected.values <- column.totals * sum(r) / table.total
    # Each cell's chi value is the squared difference between
    # expected and actual, standardized by dividing by expected
    cell.chis <- (expected.values - r)^2 / expected.values

    # And the row chi is just the sum of cell chis...
    return(sum(cell.chis))
}



# Display information about a particular clustering step
# Called by greenclust() when verbose==TRUE
#
#           x: matrix at this step (with row names)
#           n: cluster step number
# combo.names: character vector of the names of the two rows
#              that were combined at this step
#         tie: were there tied chi-squares at this step?
#       chisq: chi-squared statistic of this step's matrix
#           p: p-value of above chi-squared test
# initial.chi: chi-squared of original (unclustered) matrix,
#              used to calculate r-squared
#
.print.step <- function(x, n, combo.names, tie, chisq, p, initial.chi) {

    # Display step information
    cat(paste0("Step ", n, ":  "))
    cat(paste(combo.names, collapse=" + "))
    if (tie) {
        cat(" (tie)\n\n")
    } else {
        cat("\n\n")
    }

    # Show current table/matrix
    print(x)

    # Display stats
    cat(paste("\nChi-squared:", round(chisq, 2), "\n"))
    cat(paste("p-value:", signif(p, 4), "\n"))
    cat(paste("R-squared:", round(chisq/initial.chi, 4), "\n\n\n"))

    # Make sure it prints...
    utils::flush.console()
}





###############################################################################



# Legacy version of greenclust.
#
# Call using this syntax:  greenclust:::.greenclust.v1()
#
# Same function interface as the current version. Uses a far less-efficient
# clustering method, but has essentially the same output. Some tied and
# nearly-tied row combinations may cluster in a different order from the
# current (version 1.1) of the function.
#
# Maintained in case exact reproducability with version 1.0 is still needed,
# and to perfom speed comparisons with version 1.1.
#
#  *** This function is deprecated and may be removed in future versions ***
#
.greenclust.v1 <- function(x, correct=FALSE, verbose=FALSE) {

    # Check for valid arguments
    if (anyNA(x) || is.null(x) || !(is.matrix(x) || is.data.frame(x)))
        stop("x must be a non-null matrix or data frame")
    # If x is a dataframe, convert to matrix
    if (is.data.frame(x)) {
        x = as.matrix(x)
    }
    if (!is.numeric(x) || sum(is.nan(x)) != 0)
        stop("x must be numeric")
    if (any(x < 0) || sum(is.infinite(x)) != 0)
        stop("all elements must be nonnegative and finite")
    if (nrow(x) < 3)
        stop("x should have at least 3 rows")
    if (ncol(x) < 2)
        stop("x should have at least 2 columns")
    if(sum(apply(x, 1, sum)==0) > 0)
        stop("all row totals must be greater than zero")
    if(sum(apply(x, 2, sum)==0) > 0)
        stop("all column totals must be greater than zero")

    # If there are no row names, give them names
    if (is.null(rownames(x))) {
        rownames(x) <- 1:nrow(x)
    }

    # Remember chi-squared for the initial, un-clustered matrix
    suppressWarnings(initial.chi <- chisq.test(x, correct=correct)$statistic)
    # Replace row names with negative row numbers, for building merge matrix
    saved.names <- rownames(x)
    n <- nrow(x)
    rownames(x) <- -(1:n)
    # Initialize merge matrix and object vectors
    merge.matrix <- vector()
    heights <- vector()
    p.values <- vector()
    tie <- vector()

    # Main clustering loop. Iterates over each clustering decision step.
    for (cluster.number in 1:(n-1)) {

        # Figure out how many row combinations we'll try out this iteration
        trials <- choose(nrow(x), 2)
        # Track the best chi-square, as well as the p-value and rows combined
        # that go with that best chi-square trial
        best.chi <- -Inf
        best.p <- NA
        best.i <- NA
        best.j <- NA
        best.trial <- NA
        trial.num <- 1

        # Loop over each combination of two rows in current version of matrix
        for (i in 1:(nrow(x)-1)) {
            # Combine row i with every row higher than i...
            for (offset in 1:(nrow(x)-i)) {
                j <- i + offset
                trial.x <- .combine.rows(x, i, j, cluster.number)
                # Calculate new chi-sq test results
                new.chi <- suppressWarnings(chisq.test(trial.x,
                                                       correct=correct))
                # Best so far?
                if (new.chi$statistic > best.chi) {
                    best.chi <- new.chi$statistic
                    best.p <- new.chi$p.value
                    best.i <- i
                    best.j <- j
                    best.trial <- trial.num
                    tie.flag <- FALSE
                } else if (new.chi$statistic == best.chi) {
                    tie.flag <- TRUE
                }
                # On to the next trial...
                trial.num <- trial.num + 1
            }
        }

        # Add info on winning clustering to our tracking variables
        merge.matrix <- rbind(merge.matrix,
                              as.numeric(rownames(x)[c(best.i, best.j)]))
        heights <- c(heights, (initial.chi - best.chi)/initial.chi)
        p.values <- c(p.values, best.p)
        tie <- c(tie, tie.flag)

        # Change our matrix to the winning combined row matrix
        if (best.trial==trials) {
            x <- trial.x
        } else {
            x <- .combine.rows(x, best.i, best.j, cluster.number)
        }

        if (verbose && cluster.number < (n - 1)) {
            .print.step.v1(x, saved.names, cluster.number, tie.flag,
                           best.chi, best.p, initial.chi)
        }
    }

    # Final height should always be 1 (for plotting)
    heights[length(heights)] <- 1
    # Remove the height names given by the chisq.test function
    names(heights) <- NULL

    # Final p.value is not meaningful
    p.values <- p.values[-length(p.values)]

    # Pack everything up (except the order vector)
    grc <- structure(list(merge=merge.matrix,
                          height=heights,
                          order=vector(),
                          labels=saved.names,
                          call=match.call(),
                          dist.method="chi-squared",
                          p.values=p.values,
                          tie=tie),
                     class=c("hclust", "greenclust"))

    # Use dendrogram object to help create the order vector
    grc$order <- order.dendrogram(as.dendrogram(grc))

    return(grc)
}


# Return a new matrix that sums rows r1 and r2 of matrix m
# combo.name is the name to give the newly-combined row
# For speed, no checks are done. Ensure that:
#    * Index r1 is less than r2
#    * Both are valid rows in range of m
#
# This is called by .greenclust.v1() and is deprecated as of v1.1
#
.combine.rows <- function(m, r1, r2, combo.name) {
    # Copy matrix without row r1, preserving dimensions
    # (i.e., not letting it become a vector)
    new.m <- m[-r1, , drop=FALSE]
    # Replace row r2 (which is now at row r2-1) with combination
    combrow <- r2 - 1
    new.m[combrow, ] <- apply(m[c(r1, r2), ], 2, sum)
    row.names(new.m)[combrow] <- combo.name
    return(new.m)
}


# Display information about a particular clustering step
# Called by greenclust() when verbose==TRUE
#
#           x: final matrix at this step
#  orig.names: text to display when row name < 0
#           n: cluster step number
#    tie.flag: were there tied chi-squares at this step?
#       chisq: chi-squared statistic of this step's matrix
#           p: p-value of above chi-squared test
# initial.chi: chi-squared of original (unclustered) matrix,
#              used to calculate r-squared
#
# This is called by .greenclust.v1() and is deprecated as of v1.1
#
.print.step.v1 <- function(x, orig.names, n, tie.flag, chisq, p, initial.chi) {
    # Translate negative row names to original text and append
    # non-negative row names with "Cluster "
    rnames <- rownames(x)
    name.is.neg <- as.numeric(rnames) < 0
    rnames[name.is.neg] <- orig.names[-as.numeric(rnames[name.is.neg])]
    rnames[!name.is.neg] <- paste("Cluster", rnames[!name.is.neg])
    rownames(x) <- rnames
    # Display step information
    cat(paste("Step:", n))
    if (tie.flag) {
        cat(" (tie)\n")
    } else {
        cat("\n")
    }
    print(x)
    cat(paste("\nChi-squared:", round(chisq, 2), "\n"))
    cat(paste("p-value:", signif(p, 4), "\n"))
    cat(paste("R-squared:", round(chisq/initial.chi, 4), "\n\n\n"))
    utils::flush.console()
}
JeffJetton/greenclust documentation built on Oct. 27, 2019, 9:36 a.m.