R/util.R

Defines functions as.densityMclust.Mclust as.densityMclust.default as.densityMclust as.Mclust.densityMclust as.Mclust.default as.Mclust catwrap nclass.numpy hdrlevels covw errorBars unchol traceW shapeO dmvnorm partuniq partconv softmax logsumexp randomOrthogonalMatrix BrierScore unmap map mapClass classError

Documented in as.densityMclust as.densityMclust.default as.densityMclust.Mclust as.Mclust as.Mclust.default as.Mclust.densityMclust BrierScore classError covw dmvnorm errorBars hdrlevels logsumexp map mapClass partconv partuniq randomOrthogonalMatrix shapeO softmax traceW unchol unmap

adjustedRandIndex <- function (x, y) 
{
  x <- as.vector(x)
  y <- as.vector(y)
  if(length(x) != length(y)) 
    stop("arguments must be vectors of the same length")
  tab <- table(x,y)
  if(all(dim(tab)==c(1,1))) return(1)
  a <- sum(choose(tab, 2))
  b <- sum(choose(rowSums(tab), 2)) - a
  c <- sum(choose(colSums(tab), 2)) - a
  d <- choose(sum(tab), 2) - a - b - c
  ARI <- (a - (a + b) * (a + c)/(a + b + c + d)) /
    ((a + b + a + c)/2 - (a + b) * (a + c)/(a + b + c + d))
  return(ARI)
}


classError <- function(classification, class)
{
  q <- function(map, len, x)
  {
    x <- as.character(x)
    map <- lapply(map, as.character)
    y <- sapply(map, function(x) x[1])
    best <- y != x
    if(all(len) == 1)
      return(best)
    errmin <- sum(as.numeric(best))
    z <- sapply(map, function(x) x[length(x)])
    mask <- len != 1
    counter <- rep(0, length(len))
    k <- sum(as.numeric(mask))
    j <- 0
    while(y != z) 
    {
      i <- k - j
      m <- mask[i]
      counter[m] <- (counter[m] %% len[m]) + 1
      y[x == names(map)[m]] <- map[[m]][counter[m]]
      temp <- y != x
      err <- sum(as.numeric(temp))
      if(err < errmin) 
      {
        errmin <- err
        best <- temp
      }
      j <- (j + 1) %% k
    }
    best
  }
  if(any(isNA <- is.na(classification))) 
  {
    classification <- as.character(classification)
    nachar <- paste(unique(classification[!isNA]),collapse="")
    classification[isNA] <- nachar
  }
  MAP <- mapClass(classification, class)
  len <- sapply(MAP[[1]], length)
  if(all(len) == 1) 
  {
    CtoT <- unlist(MAP[[1]])
    I <- match(as.character(classification), names(CtoT), nomatch= 0)
    one <- CtoT[I] != class
  } else 
  {
    one <- q(MAP[[1]], len, class)
  }
  len <- sapply(MAP[[2]], length)
  if(all(len) == 1) 
  {
    TtoC <- unlist(MAP[[2]])
    I <- match(as.character(class), names(TtoC), nomatch = 0)
    two <- TtoC[I] != classification
  } else 
  {
    two <- q(MAP[[2]], len, classification)
  }
  err <- if(sum(as.numeric(one)) > sum(as.numeric(two))) 
            as.vector(one) else as.vector(two)
  bad <- seq(along = classification)[err]
  list(misclassified = bad, errorRate = length(bad)/length(class))
}

mapClass <- function(a, b)
{
  l <- length(a)
  x <- y <- rep(NA, l)
  if(l != length(b)) 
  {
    warning("unequal lengths")
    return(x)
  }
  # LS: new - check if both a & b are factors or character vectors
  # with the same levels then assume they are known classes and 
  # match by level names
  if(is.factor(a) & is.factor(b) & nlevels(a) == nlevels(b))
  {
    aTOb <- as.list(levels(b))
    names(aTOb) <- levels(a)
    bTOa <- as.list(levels(a))
    names(bTOa) <- levels(b)
    out <- list(aTOb = aTOb, bTOa = bTOa)
    return(out)
  }
  if(is.character(a) & is.character(b) & 
     length(unique(a)) == length(unique(b)))
  {
    aTOb <- as.list(unique(b))
    names(aTOb) <- unique(a)
    bTOa <- as.list(unique(a))
    names(bTOa) <- unique(b)
    out <- list(aTOb = aTOb, bTOa = bTOa)
    return(out)
  }
  # otherwise match by closest class correspondence
  Tab <- table(a, b)
  Ua <- dimnames(Tab)[[1]]
  Ub <- dimnames(Tab)[[2]]
  aTOb <- rep(list(Ub), length(Ua))
  names(aTOb) <- Ua
  bTOa <- rep(list(Ua), length(Ub))
  names(bTOa) <- Ub
  #
  k <- nrow(Tab)
  Map <- rep(0, k)
  Max <- apply(Tab, 1, max)
  for(i in 1:k)
  {
    I <- match(Max[i], Tab[i,  ], nomatch = 0)
    aTOb[[i]] <- Ub[I]
  }
  if(is.numeric(b))
    aTOb <- lapply(aTOb, as.numeric)
  #
  k <- ncol(Tab)
  Map <- rep(0, k)
  Max <- apply(Tab, 2, max)
  for(j in (1:k)) {
    J <- match(Max[j], Tab[, j])
    bTOa[[j]] <- Ua[J]
  }
  if(is.numeric(a))
    bTOa <- lapply(bTOa, as.numeric)
  #
  out <- list(aTOb = aTOb, bTOa = bTOa)
  return(out)
}

map <- function(z, warn = mclust.options("warn"), ...)
{
  nrowz <- nrow(z)
  cl <- numeric(nrowz)
  I <- 1:nrowz
  J <- 1:ncol(z)
  for(i in I) 
     { cl[i] <- (J[z[i,  ] == max(z[i,  ])])[1] }
  if(warn) 
    { K <- as.logical(match(J, sort(unique(cl)), nomatch = 0))
      if(any(!K))
        warning(paste("no assignment to", paste(J[!K], 
                      collapse = ",")))
  }
  return(cl)
}

unmap <- function(classification, groups=NULL, noise=NULL, ...)
{
  # converts a classification to conditional probabilities
  # classes are arranged in sorted order unless groups is specified
  # if a noise indicator is specified, that column is placed last
  n <- length(classification)
  u <- sort(unique(classification))
  if(is.null(groups))
    { groups <- u }
  else 
    { if(any(match( u, groups, nomatch = 0) == 0)) 
      stop("groups incompatible with classification")
      miss <- match( groups, u, nomatch = 0) == 0
    }
  cgroups <- as.character(groups)
  if(!is.null(noise)) 
    { noiz <- match( noise, groups, nomatch = 0)
      if(any(noiz == 0)) stop("noise incompatible with classification")
      groups <- c(groups[groups != noise],groups[groups==noise])
      noise <- as.numeric(factor(as.character(noise), levels = unique(groups)))
  }
  groups <- as.numeric(factor(cgroups, levels = unique(cgroups)))
  classification <- as.numeric(factor(as.character(classification), levels = unique(cgroups)))
  k <- length(groups) - length(noise)
  nam <- levels(groups)
  if(!is.null(noise)) 
    { k <- k + 1
      nam <- nam[1:k]
      nam[k] <- "noise"
  }
  z <- matrix(0, n, k, dimnames = c(names(classification),nam))
  for(j in 1:k)
     { z[classification == groups[j], j] <- 1 }
  return(z)
}

BrierScore <- function(z, class)
{
  z <- as.matrix(z)
  z <- sweep(z, 1, STATS = rowSums(z), FUN = "/")
  cl <- unmap(class, groups = if(is.factor(class)) levels(class) else NULL)
  if(any(dim(cl) != dim(z)))
    stop("input arguments do not match!")
  sum((cl-z)^2)/(2*nrow(cl))
}

orth2 <- function (n) 
{
  u <- rnorm(n)
  u <- u/vecnorm(u)
  v <- rnorm(n)
  v <- v/vecnorm(v)
  Q <- cbind(u, v - sum(u * v) * u)
  dimnames(Q) <- NULL
  Q
}

randomOrthogonalMatrix <- function(nrow, ncol, n = nrow, d = ncol, seed = NULL)
{
# Generate a random orthogonal basis matrix of dimension (nrow x ncol) using 
# the algorithm in
# Heiberger R. (1978) Generation of random orthogonal matrices. JRSS C, 27,
#   199-206.

  if(!is.null(seed)) set.seed(seed)
  if(missing(nrow) & missing(n)) stop()
  if(missing(nrow)) 
  {
    warning("Use of argument 'n' is deprecated. Please use 'nrow'")
    nrow <- n
  }
  if(missing(ncol) & missing(d)) stop()
  if(missing(ncol)) 
  {
    warning("Use of argument 'd' is deprecated. Please use 'ncol'")
    ncol <- d
  }
  
  Q <- qr.Q(qr(matrix(rnorm(nrow*ncol), nrow = nrow, ncol = ncol)))
  return(Q)
}

# TODO: to be removed at a certain point
# logsumexp_old <- function(x)
# { 
# # Numerically efficient implementation of log(sum(exp(x)))
#   max <- max(x)
#   max + log(sum(exp(x-max)))
# }

logsumexp <- function(x, v = NULL)
{
# Numerically efficient implementation of 
#   log-sum-exp(x_i+v)  for i = 1,...,n
# via Fortran call
  x <- if(is.null(dim(x))) matrix(x, nrow = 1) else as.matrix(x)
  n <- nrow(x)
  k <- ncol(x)
  v <- if(is.null(v)) double(k) else as.vector(v)
  stopifnot(length(v) == k)
  .Fortran("logsumexp", 
           x = x, n = as.integer(n), k = as.integer(k), v = v, 
           lse = double(n))$lse
}

softmax <- function(x, v = NULL)
{
# Efficiently computes softmax function based on 
#   log-sum-exp(x_i+v) for i = 1,...,n
# via Fortran call such that the output matrix z (n x k) has 
# rowsum(z_j) = 1 for j = 1,...,k
# 
  x <- if(is.null(dim(x))) matrix(x, nrow = 1) else as.matrix(x)
  n <- nrow(x)
  k <- ncol(x)
  v <- if(is.null(v)) double(k) else as.vector(v)
  stopifnot(length(v) == k)
  
  z <- .Fortran("softmax", 
                x = x, n = as.integer(n), k = as.integer(k), 
                v = as.double(v), lse = double(n), z = double(n * k))$z
  z <- matrix(z, nrow = n, ncol = k)
  return(z)
}

partconv <- function(x, consec = TRUE)
{
  n <- length(x)
  y <- numeric(n)
  u <- unique(x)
  if(consec) {
    # number groups in order of first row appearance
    l <- length(u)
    for(i in 1:l)
      y[x == u[i]] <- i
  }
  else {
    # represent each group by its lowest-numbered member
    for(i in u) {
      l <- x == i
      y[l] <- (1:n)[l][1]
    }
  }
  y
}

partuniq <- function(x)
{
  # finds the classification that removes duplicates from x
  charconv <- function(x, sep = "001")
  {
    if(!is.data.frame(x)) x <- data.frame(x)
    do.call("paste", c(as.list(x), sep = sep))
  }
  
  n <- nrow(x)
  x <- charconv(x)
  k <- duplicated(x)
  partition <- 1.:n
  partition[k] <- match(x[k], x)
  partition
}

dmvnorm <- function(data, mean, sigma, log = FALSE) 
{
  data <- as.matrix(data)
  n <- nrow(data)
  d <- ncol(data)
  if(missing(mean))
    mean <- rep(0, length = d)
  mean <- as.vector(mean)
  if(length(mean) != d)
    stop("data and mean have non-conforming size")
  if(missing(sigma)) 
    sigma <- diag(d)
  sigma <- as.matrix(sigma)
  if(ncol(sigma) != d) 
    stop("data and sigma have non-conforming size")
  if(max(abs(sigma - t(sigma))) > sqrt(.Machine$double.eps))
    stop("sigma must be a symmetric matrix")
  
  # - 1st approach
  # cholsigma <- chol(sigma)
  # logdet <- 2 * sum(log(diag(cholsigma)))
  # md <- mahalanobis(data, center = mean, 
  #                   cov = chol2inv(cholsigma), inverted = TRUE)
  # logdens <- -(ncol(data) * log(2 * pi) + logdet + md)/2
  #
  # - 2nd approach
  # cholsigma <- chol(sigma)
  # logdet <- 2 * sum(log(diag(cholsigma)))
  # mean <- outer(rep(1, nrow(data)), as.vector(matrix(mean,d)))
  # data  <- t(data - mean)
  # conc <- chol2inv(cholsigma)
  # Q <- colSums((conc %*% data)* data)
  # logdens <- as.vector(Q + d*log(2*pi) + logdet)/(-2)
  #
  # - 3rd approach (via Fortran code)
  logdens <- .Fortran("dmvnorm", 
                      as.double(data),  # x
                      as.double(mean),  # mu
                      as.double(sigma), # Sigma
                      as.integer(n),    # n
                      as.integer(d),    # p
                      double(d),        # w
                      double(1),        # hood
                      double(n),        # logdens
                      PACKAGE = "mclust")[[8]]
  #  
  if(log) logdens else exp(logdens)
}

shapeO <- function(shape, O, transpose = FALSE)
{
  dimO <- dim(O)
  if(dimO[1] != dimO[2])
    stop("leading dimensions of O are unequal")
  if((ldO <- length(dimO)) != 3) {
    if(ldO == 2) {
      dimO <- c(dimO, 1)
      O <- array(O, dimO)
    }
    else stop("O must be a matrix or an array")
  }
  l <- length(shape)
  if(l != dimO[1])
    stop("dimension of O and length s are unequal")
  storage.mode(O) <- "double"
  .Fortran("shapeo",
           as.logical(transpose),
           as.double(shape),
           O,
           as.integer(l),
           as.integer(dimO[3]),
           double(l * l),
           integer(1),
           PACKAGE = "mclust")[[3]]
}

traceW <- function(x)
{
  # sum(as.vector(sweep(x, 2, apply(x, 2, mean)))^2)
  dimx <- dim(x)
  n <- dimx[1]
  p <- dimx[2]
  .Fortran("mcltrw",
           as.double(x),
           as.integer(n),
           as.integer(p),
           double(p),
           double(1),
           PACKAGE = "mclust")[[5]]
}

unchol <- function(x, upper = NULL)
{
  if(is.null(upper)) {
    upper <- any(x[row(x) < col(x)])
    lower <- any(x[row(x) > col(x)])
    if(upper && lower)
      stop("not a triangular matrix")
    if(!(upper || lower)) {
      x <- diag(x)
      return(diag(x * x))
    }
  }
  dimx <- dim(x)
  storage.mode(x) <- "double"
  .Fortran("uncholf",
           as.logical(upper),
           x,
           as.integer(nrow(x)),
           as.integer(ncol(x)),
           integer(1),
           PACKAGE = "mclust")[[2]]
}

vecnorm <- function (x, p = 2) 
{
  if (is.character(p)) {
    if (charmatch(p, "maximum", nomatch = 0) == 1) 
      p <- Inf
    else if (charmatch(p, "euclidean", nomatch = 0) == 1) 
      p <- 2
    else stop("improper specification of p")
  }
  if (!is.numeric(x) && !is.complex(x)) 
    stop("mode of x must be either numeric or complex")
  if (!is.numeric(p)) 
    stop("improper specification of p")
  if (p < 1) 
    stop("p must be greater than or equal to 1")
  if (is.numeric(x)) 
    x <- abs(x)
  else x <- Mod(x)
  if (p == 2) 
    return(.Fortran("d2norm", as.integer(length(x)), as.double(x), 
                    as.integer(1), double(1), PACKAGE = "mclust")[[4]])
  if (p == Inf) 
    return(max(x))
  if (p == 1) 
    return(sum(x))
  xmax <- max(x)
  if (!xmax) 
    xmax <- max(x)
  if (!xmax) 
    return(xmax)
  x <- x/xmax
  xmax * sum(x^p)^(1/p)
}

errorBars <- function(x, upper, lower, width = 0.1, code = 3, angle = 90, horizontal = FALSE, ...) 
{ 
# Draw error bars at x from upper to lower. If horizontal = FALSE (default)
# bars are drawn vertically, otherwise horizontally.
  if(horizontal)
    arrows(upper, x, lower, x, length = width, angle = angle, code = code, ...)
  else  
    arrows(x, upper, x, lower, length = width, angle = angle, code = code, ...)
}

covw <- function(X, Z, normalize = TRUE)
# Given data matrix X(n x p) and weight matrix Z(n x G) computes
# weighted means(p x G), weighted covariance matrices S(p x p x G) and
# weighted scattering matrices W(p x p x G)
{
  X <- as.matrix(X)
  Z <- as.matrix(Z)
  n <- nrow(X)
  p <- ncol(X)
  nZ <- nrow(Z)
  G <- ncol(Z)
  if(n != nZ) 
    stop("X and Z must have same number of rows")
  if(normalize)
    Z <- t( apply(Z, 1, function(z) z/sum(z)) )
  
  tmp <- .Fortran("covwf",
                  X = as.double(X),
                  Z = as.double(Z),
                  n = as.integer(n),
                  p = as.integer(p),
                  G = as.integer(G),
                  mean = double(p*G),
                  S = double(p*p*G),
                  W = double(p*p*G),
                  PACKAGE = "mclust")
  
  out <- list(mean = matrix(tmp$mean, p,G), 
              S = array(tmp$S, c(p,p,G)),
              W = array(tmp$W, c(p,p,G)) )
  return(out)
}

hdrlevels <- function(density, prob)
{
# Compute the levels for Highest Density Levels (HDR) for estimated 'density' 
# values and probability levels 'prob'.
# 
# Reference: Hyndman (1996) Computing and Graphing Highest Density Regions	
  if(missing(density) | missing(prob))
    stop("Please provide both 'density' and 'prob' arguments to function call!")
  density <- as.vector(density)
  prob <- pmin(pmax(as.numeric(prob), 0), 1)
  alpha <- 1-prob
  lev <- quantile(density, alpha, na.rm = TRUE)
  names(lev) <- paste0(round(prob*100),"%")
  return(lev)
}

nclass.numpy <- function(x, ...)
{
#' Compute the number of classes for a histogram as the maximum of the 
#' "Sturges" and "FD" (Freedman Diaconis) estimators as in numpy library
#' for Python. 
#' x: a vector of values
#'
#' Example:
#' x <- rnorm(100)
#' hist(x, breaks = nclass.numpy(x))
#' x <- rnorm(1000)
#' hist(x, breaks = nclass.numpy(x))
#' n = c(50, seq(100,1000,by=100))
#' brks = rep(NA, length(n))
#' for(i in seq(n)) brks[i] = nclass.numpy(rnorm(n[i]))
#' plot(n, brks)
#' 
  x <- as.vector(x)
  max(nclass.Sturges(x), nclass.FD(x))
}

catwrap <- function(x, width = getOption("width"), ...)
{
# version of cat with wrapping at specified width
  cat(paste(strwrap(x, width = width, ...), collapse = "\n"), "\n")
}


##
## Convert to a from classes 'Mclust' and 'densityMclust' 
##

as.Mclust <- function(x, ...)
{ 
  UseMethod("as.Mclust")
}

as.Mclust.default <- function(x, ...)
{ 
  if(inherits(x, "Mclust")) x
  else stop("argument 'x' cannot be coerced to class 'Mclust'")
}

as.Mclust.densityMclust <- function(x, ...)
{ 
  class(x) <- c("Mclust", class(x)[1])
  return(x)
}

as.densityMclust <- function(x, ...)
{ 
  UseMethod("as.densityMclust")
}

as.densityMclust.default <- function(x, ...)
{ 
  if(inherits(x, "densityMclust")) x
  else stop("argument 'x' cannot be coerced to class 'densityMclust'")
}

as.densityMclust.Mclust <- function(x, ...)
{ 
  class(x) <- c("densityMclust", class(x))
  x$density <- dens(data = x$data, modelName = x$modelName, 
                    parameters = x$parameters, logarithm = FALSE)
  return(x)
}

Try the mclust package in your browser

Any scripts or data that you put into this service are public.

mclust documentation built on May 29, 2024, 8:06 a.m.