## R/bayesdist.R

##' Classify using Mahalanobis distance
##' 
##' Classifies using Mahalanobis distance
##' 
##' The \code{train} argument contains the mean and inverse covariance matrix
##' (or standard deviation if the data is one-dimensional) for each class in
##' the training set, as well as the class labels.  This function calculates
##' the Mahalanobis distance of each row of \code{data} from each class mean
##' and assigns the label of the closest mean to that row.  The result is a
##' vector of labels corresponding to the rows of \code{data}.
##' 
##' The Mahalanobis distance between a data point and a class is the Euclidean
##' distance between the point and the class mean, weighted by the inverse of
##' the class covariance matrix.  This means that classes with large
##' covariances will attract data points from a larger area than those with
##' small covariances.
##' 
##' @param data A vector or matrix of data
##' @param train A Gaussian model generated by \code{train}.
##' @return A label vector with one element per row of \code{data}
##' @seealso train
##' @references O'Shaughnessy, D. Speech Communication (Addison-Wesley:
##' Reading, MA. 1987)
##' @keywords misc
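##' @examples
##' 
##' ## a minimal sketch on synthetic data; the labels "a" and "b" are
##' ## illustrative.  mahal simply forwards to classify.
##' set.seed(1)
##' dat <- rbind(matrix(rnorm(40), ncol = 2),
##'              matrix(rnorm(40, mean = 3), ncol = 2))
##' model <- train(dat, rep(c("a", "b"), each = 20))
##' mahal(dat, model)                      # warns: deprecated
##' classify(dat, model, metric = "mahal") # equivalent, preferred
##' 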
##' @export mahal
"mahal" <- function(data, train)
{
  #   if (emu.options("deprecated.warnings")) # commented out because global vars don't exist any more
  warning("mahal is deprecated, use classify with metric=\"mahal\"\n")
  classify( data, train, metric="mahal" )
}









##' Classify using Bayesian distance (deprecated)
##' 
##' Deprecated; use \code{classify} with \code{metric="bayes"} instead.
##' 
##' 
##' @keywords internal
##' @export bayes.lab
"bayes.lab" <- function(data, train)
{
  #if (emu.options("deprecated.warnings")) # commented out because global vars don't exist any more
  warning("bayes.lab is deprecated, use classify with metric=\"bayes\"\n")
  classify( data, train, metric="bayes" )
}









##' Bayesian distance (deprecated)
##' 
##' Deprecated; use \code{distance} with \code{metric="bayes"} instead.
##' 
##' 
##' @keywords internal
##' @export bayes.dist
"bayes.dist" <- function(data, train, labels=NULL)
{
  #if (emu.options("deprecated.warnings")) # commented out because global vars don't exist any more
  warning("bayes.dist is deprecated, use distance with metric=\"bayes\"\n")
  distance( data, train, labels, metric="bayes")
}









##' Calculate Mahalanobis distances
##' 
##' Calculates Mahalanobis distances
##' 
##' The \code{train} function finds the centroids and covariance matrices for a
##' set of data and corresponding labels: one per unique label.  This function
##' can be used to find the Mahalanobis distance of every data point in a
##' dataset to each of the class centroids.  The columns of the resulting
##' matrix are marked with the label of the centroid to which they refer.  The
##' function \code{mahal} should be used if you want to find the closest
##' centroid to each data point.
##' 
##' @param data A matrix of numerical data points.
##' @param train A Gaussian model as returned by the \code{train} function.
##' @param labels An optional vector of row labels for the result.
##' @return A matrix of distances with one column for every class (label) in
##' the Gaussian model.
##' @seealso train, mahal, bayes.lab, bayes.dist
##' @keywords misc
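##' @examples
##' 
##' ## a minimal sketch on synthetic data; the labels "a" and "b" are
##' ## illustrative
##' set.seed(1)
##' dat <- rbind(matrix(rnorm(40), ncol = 2),
##'              matrix(rnorm(40, mean = 3), ncol = 2))
##' model <- train(dat, rep(c("a", "b"), each = 20))
##' head(mahal.dist(dat, model))
##' 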
##' @export mahal.dist
"mahal.dist" <- function( data, train, labels=NULL )
{
  #if (emu.options("deprecated.warnings")) # commented out because global vars don't exist any more
  #cat("mahal.dist is deprecated, use distance with metric=\"mahal\"\n") # commented out because global vars don't exist any more
  distance( data, train, labels, metric="mahal")
}


## distance() generalises bayes.dist and mahal.dist








##' Distance of data points from class centroids
##' 
##' Computes, for each row of \code{data}, its distance from every class
##' centroid in \code{train} under the chosen metric ("bayes", "mahal" or
##' "euclidean").
##' 
##' 
##' @keywords internal
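##' @examples
##' 
##' ## a minimal sketch: one column of distances per class; the labels
##' ## "a" and "b" are illustrative
##' set.seed(1)
##' dat <- rbind(matrix(rnorm(40), ncol = 2),
##'              matrix(rnorm(40, mean = 3), ncol = 2))
##' model <- train(dat, rep(c("a", "b"), each = 20))
##' dim(distance(dat, model, metric = "mahal"))  # 40 rows, 2 columns
##' 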
##' @export distance
"distance" <- function( data, train, labels=NULL, metric="bayes" )
{
  ## data is a set of data points
  ## train is the result of the train fn and contains
  ##  $means - the centroids
  ##  $cov  - the covariance matrices, stacked with rbind
  ##  $invcov - the inverse covariance matrices, stacked with rbind
  ##  $label - the corresponding labels
  ## labels - an optional set of row labels corresponding to data
  ## metric - one of "bayes", "mahal" or "euclidean"
  
  if(!is.matrix(data)) data <- cbind(data)
  
  ncols <- length(train$label)
  ndims <- ncol(data)
  probs <- matrix(0, nrow = nrow(data), ncol = ncols)
  
  for(lab in 1:ncols) {
    tmp <- (lab - 1) * ndims + 1
    tmp1 <- tmp + ndims - 1
    cov <- train$cov[tmp:tmp1,  ]
    invcov <- train$invcov[tmp:tmp1,  ]
    if ( metric == "bayes" ) {
      probs[, lab] <- bayesian.metric(data, train$means[lab,  ], cov, invcov)
    } else if ( metric == "euclidean" ) {
      probs[, lab] <- euclidean.metric(data, train$means[lab,  ])
    } else if ( metric == "mahal" ) {
      probs[, lab] <- mahalanobis.metric(data, train$means[lab,  ], invcov)
    }            
  }
  dimnames(probs) <- list(labels, train$label)
  probs
}

"euclidean.metric" <- function( data, mean )
{
  return()
}










##' Bayesian (Gaussian log-likelihood) metric
##' 
##' For each row x of \code{data}, computes
##' -log(det(cov)) - (x - mean)' invcov (x - mean),
##' i.e. twice the Gaussian log likelihood of x up to an additive constant.
##' Larger values indicate a better fit to the class.
##' 
##' 
##' @keywords internal
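##' @examples
##' 
##' ## a minimal sketch: for a single point the metric equals
##' ## -log(det(S)) minus the squared Mahalanobis distance
##' S <- diag(2)
##' x <- matrix(c(1, 2), nrow = 1)
##' bayesian.metric(x, c(0, 0), S, solve(S))
##' -log(det(S)) - stats::mahalanobis(x, c(0, 0), S)
##' 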
##' @export bayesian.metric
"bayesian.metric" <- function( data, mean, cov, invcov )
{
  # calculate the Gaussian classification metric for multivariate data
  # given a mean vector and covariance matrix
  det <- -log(as.numeric(prod(eigen(cov)$values)))
  x.u <- t(data) - mean
  pow <- t(x.u) %*% invcov
  ## this would be pow %*% x.u if x.u were a single point, but x.u holds
  ## many points, so take the row sums of the elementwise product instead
  return( det - apply(pow * t(x.u), 1, sum) )
}









##' Mahalanobis metric
##' 
##' For each row x of \code{data}, computes the squared Mahalanobis distance
##' (x - mean)' invcov (x - mean) from the class mean.
##' 
##' 
##' @keywords internal
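##' @examples
##' 
##' ## a minimal sketch: agrees with stats::mahalanobis, which returns the
##' ## squared distance and inverts the covariance matrix itself
##' S <- diag(2)
##' x <- matrix(c(1, 2, 3, 4), ncol = 2)
##' mahalanobis.metric(x, c(0, 0), solve(S))
##' stats::mahalanobis(x, c(0, 0), S)
##' 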
##' @export mahalanobis.metric
"mahalanobis.metric" <- function(data, mean, invcov)
{
  x.u <- t(data) - mean
  pow <- t(x.u) %*% invcov
  ## this would be pow %*% x.u if x.u were a single point, but x.u holds
  ## many points, so take the row sums of the elementwise product instead
  return( apply(pow * t(x.u), 1, sum) )
}










##' Classify data with a Gaussian model
##' 
##' Assigns each row of \code{data} the label of the best-fitting class in
##' \code{train}.
##' 
##' 
##' @param data A matrix of data points to classify, one per row.
##' @param train A Gaussian model as returned by the \code{train} function.
##' @param metric One of "bayes", "mahal" or "euclidean".
##' @return A vector of class labels, one per row of \code{data}.
##' @author Jonathan Harrington
##' @keywords models
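##' @examples
##' 
##' ## a minimal sketch on synthetic two-class data; the labels "a" and
##' ## "b" are illustrative
##' set.seed(1)
##' dat <- rbind(matrix(rnorm(100), ncol = 2),
##'              matrix(rnorm(100, mean = 3), ncol = 2))
##' labs <- rep(c("a", "b"), each = 50)
##' model <- train(dat, labs)
##' table(labs, classify(dat, model, metric = "bayes"))
##' 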
##' @export classify
"classify" <- function(data, train, metric="bayes")
{
  probs <- distance(data, train, metric=metric )
  
  ## what's best depends on the metric: bayes is a likelihood, so max is
  ## best; mahal and euclidean are distances, so min is best
  if( metric=="bayes" ) {
    best <- apply(probs, 1, max)
  } else if ( metric=="mahal" || metric=="euclidean" ) {
    best <- apply(probs, 1, min)
  } else {
    stop("metric must be one of \"bayes\", \"mahal\" or \"euclidean\"")
  }
  
  result <- rep("", length(best))
  for(lab in 1:length(train$label)) {
    tmp <- probs[, lab] == best
    result[tmp] <- train$label[lab]
  }
  result
}










##' Plot Bayesian classification regions
##' 
##' Classifies an N x N grid of points spanning the range of the first two
##' columns of \code{data} and plots each grid point's label, visualising the
##' decision regions of the model.  If \code{ellipse=TRUE} and \code{labs} is
##' given, data ellipses are overlaid with \code{eplot}.
##' 
##' 
##' @keywords internal
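##' @examples
##' 
##' ## a minimal sketch on synthetic two-dimensional data; the labels
##' ## "a" and "b" are illustrative
##' set.seed(1)
##' dat <- rbind(matrix(rnorm(100), ncol = 2),
##'              matrix(rnorm(100, mean = 3), ncol = 2))
##' labs <- rep(c("a", "b"), each = 50)
##' bayesplot(dat, train(dat, labs), N = 15)
##' 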
##' @export bayesplot
bayesplot <- function(data, train, N = 10, ellipse = FALSE, 
                      labs = NULL, xlab="", ylab="", colour = TRUE, ...)
{
  ## data is the original data, used for scaling
  ## train is the stuff you get from train()
  rx <- range(data[, 1])
  ry <- range(data[, 2])  
  ## make an N x N grid of points covering the unit square
  points <- cbind(sort(rep(1:N/N, N)), rep(1:N/N, N))
  ## Now scale them to the data
  points[, 1] <- points[, 1] * (rx[2] - rx[1]) + rx[1]
  points[, 2] <- points[, 2] * (ry[2] - ry[1]) + ry[1]
  ## now classify each point
  blabs <- classify(points, train, metric="bayes")
  graphics::plot(points, type = "n", xlim = rx, ylim = ry, xlab=xlab, ylab=ylab)
  ulabs <- unique(blabs)
  k <- 1
  colours <- mu.colour( ulabs, colour, FALSE )$colour
  for(j in ulabs) {
    temp <- muclass(blabs, j)
    graphics::text(points[temp,  ], blabs[temp], col = colours[k])
    k <- k + 1
  }
  if(ellipse && !is.null(labs) ) {
    oldpar = graphics::par(new = TRUE)
    on.exit(graphics::par(oldpar))
    eplot(data, labs, xlim = rx, ylim = ry, colour=colour, ...)
  }
}









##' Train a Gaussian Model
##' 
##' Trains a Gaussian model
##' 
##' This function is used to train a Gaussian model on a data set. The result
##' can be passed to either the \code{mahal} or \code{bayes.lab} functions to
##' classify either the training set (\code{x}) or a test set with the same
##' number of dimensions.  \code{train} simply finds the mean and inverse
##' covariance matrix/standard deviation for the data corresponding to each
##' unique label in \code{lab}.
##' 
##' @param x A data vector or matrix.
##' @param lab A vector of labels parallel to \code{x}. If missing, all data is
##' assumed to be from the same class.
##' @return A list with the following components:
##' \item{label}{The unique labels in \code{lab}.}
##' \item{means}{The means for each dimension per unique label.}
##' \item{cov}{The covariance matrices for each unique label, joined with
##' \code{rbind}.  If the input data is one-dimensional, this is just the
##' standard deviation of the data.}
##' \item{invcov}{The inverse covariance matrices for each unique label, joined
##' with \code{rbind}.  If the input data is one-dimensional, this is just the
##' reciprocal of the standard deviation of the data.}
##' @seealso mahal, bayes.lab, mahalplot, bayesplot
##' @keywords misc
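##' @examples
##' 
##' ## a minimal sketch on synthetic data; the labels "a" and "b" are
##' ## illustrative
##' set.seed(1)
##' dat <- rbind(matrix(rnorm(40), ncol = 2),
##'              matrix(rnorm(40, mean = 3), ncol = 2))
##' model <- train(dat, rep(c("a", "b"), each = 20))
##' model$label     # "a" "b"
##' model$means     # one row of means per label
##' dim(model$cov)  # 2 x 2 covariance blocks stacked with rbind: 4 x 2
##' 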
##' @export train
"train"<- function(x, lab=rep("x",nrow(x)))
{
  mat <- NULL
  if(is.matrix(x)){
    summeanvals <- NULL
    sumcovvals <- NULL
    sumcovvals.inv <- NULL
    for(j in unique(lab)) {
      temp <- lab == j
      # can only do this if there's more than one row
      if( sum( temp ) == 1 ) {
        ## what to do??
        ## can't do anything sensible so barf and tell them why
        stop( "\n\tData passed to train has only one entry for one of the labels.\n\tA gaussian model can't be generated for this data." )
      }
      values <- x[temp,]
      meanvals <- apply(values, 2, mean)
      covvals <- var(values, values)
      covvals.inv <- solve(covvals)
      summeanvals <- rbind(summeanvals, meanvals)
      sumcovvals <- rbind(sumcovvals, covvals)
      sumcovvals.inv <- rbind(sumcovvals.inv, covvals.inv)
    }
    mat$label <- unique(lab)
    mat$means <- summeanvals
    mat$cov   <- sumcovvals
    mat$invcov <- sumcovvals.inv
  }  else { # the one dimensional case
    mat <- NULL
    mat$means <- NULL
    mat$cov <- NULL
    for(j in unique(lab)) {
      # cat("data for ", j, " \n" )
      temp <- lab == j			
      mat$means <- c(mat$means, mean(x[temp]))
      mat$cov <- c(mat$cov, sqrt(var(x[temp])))
    }
    mat$label <- unique(lab)
    mat$invcov <- 1/mat$cov   # in fact this won't be used in the 1d case
  }
  mat
}
