HMMt: HMM with Student's t emissions

#' BaumWelchT HMM
#' 
#' Estimate the HMM with emissions distributed as multi-dimensional correlated T variables
#' @author Guillaume Filion.
#' date: June 17, 2011.
#' @return the estimate of the HMM with emissions distributed
#' as multi dimensional correlated T variables.
#' @param x matrix of observations.
#' @param series.length length of independent blocks in matrix x.
#' @param m number of states.
#' @param Q transition matrix.
#' @param mu means of the emissions in different states.
#' @param S correlation matrix of the emissions.
#' @param nu degrees of freedom (in all states).
#' @param model a model matrix for parameter estimation.
#' @param initial.prob state probability at the start of series.
#' @param maxiter maximum number of iterations before returning.
#' @param overflow the probability ratio in case of numeric overflow.
#' @param num.inst threshold for no update (see E.step()).
#' @param tol threshold to difference in log-likelihood before returning.
#' @param dig numeric precision for parameter estimation.
#' @examples
#' library(HMMt)
#' x <- c(rt(1000, df=3), rt(1000, df=3)+1)
#' 
#' # x has a t distribution with a jump at position 1001.
#' plot(x, type = 'l')
#' 
#' # Check the output of BaumWelchT. 
#' BaumWelchT(x)
#' 
#' # See that it usually finds the transition.
#' lines(BaumWelchT(x)$ViterbiPath-1, col=2)
#' @export 
BaumWelchT <- function (x, series.length, m = 2, Q, mu, S, nu = TRUE,
     model, initial.prob, maxiter = 500, overflow = 1e-9, num.inst = 1e-9,
     tol = 1e-05, dig = 3) {

###############################################
#              OPTION PROCESSING              #
###############################################

    if (is.vector(x)) {
        x <- as.matrix(x);
    }
    p <- ncol(x)
    n <- nrow(x)

    if (missing(series.length)) {
        series.length <- n;
    }
    stopifnot (sum(series.length) == n);

    if (!missing(Q)) {
        if (!all.equal(dim(Q), c(m,m)))
            stop("Q must be an m by m matrix");
    }
    else {
        Q <- matrix(0.1/(m-1), ncol = m, nrow = m);
        diag(Q) <- 0.9;
    }

    # Assign initial values to mu by k-means or quantiles.
    if (missing(mu)) {
        if ((p == 1) && (m == 2)) {
            mu <- matrix(quantile(x = x, probs = c(0.5, 0.9),
                na.rm = TRUE), nrow = m);
        }
        else {
            clusters <- kmeans(x[complete.cases(x),], centers = m);
            mu <- clusters$centers;
        }
    }
    if (!all.equal(dim(mu), c(m, p))) {
        stop ("mu must be a m x p matrix");
    }

    # Assign initial values to covariance matrices.
    if (missing(S)) {
        S <- array(NA_real_, dim = c(p, p, m));
        for (i in 1:m) {
            S[,,i] <- var(sweep(x = x, MARGIN = 2,
                STATS = mu[i,]), na.rm = TRUE);
        }
    }     

    if (!is.infinite(nu)) {
        if (!length(nu) == 1)
            stop ("nu must be a vector of length 1");
        if (is.logical(nu)) {
            if (nu) {
                nu <- 6;
            }
            else {
                nu <- Inf;
            }
        }
    }

        if (missing(model)) {
        model <- array(1:m, dim=c(m,p));
    }
    else {
        stopifnot (all(dim(model) == c(m,p)));
    }

    adjust.initial.prob <- missing(initial.prob);




###############################################
#            VARIABLE DEFINITIONS             #
###############################################


    old.loglik <- -Inf;
    iter <- 0;

    # Updated in E.step (Q as well).
    emission.prob <- matrix(NA_real_, nrow = n, ncol = m)
    phi <- matrix(NA_real_, nrow = n, ncol = m)
    phi.weights <- matrix(NA_real_, nrow = n, ncol = m)
    loglik <- NA_real_


###############################################
#           FUNCTION DEFINITIONS              #
###############################################


    initial.steady.state.probabilities <- function () {
    # Compute the steady-state initial probabilities.

        spectre <- eigen(t(Q))

        if (is.complex(spectre$values[1]))
            return(rep(1/m,m))
        if (spectre$values[1] > 0.99)
            return(spectre$vectors[,1] / sum(spectre$vectors[,1]))

        return(rep(1/m,m))

    }

    # Perform the E-step of the modified Baum-Welch algorithm.
    # Note: the double arrow assignment <<- looks for the 
    # variable on the enclosing scopes. The design prevents
    # passing parameters by value for objects that can be
    # very large, such as 'emission.prob' and 'phi.weights'.
    # The M-step can yield invalid variance estimates because
    # of numeric underflow (see update.S() for example). In that
    # case no update is performed for the given state and the
    # algorithm 'waits' for valid parameter values.
    E.step <- function () {

        weights.i <- matrix(1, nrow = n, ncol = m);

        # Emission probabilities are computed up to constant terms.
        # The following piece of code is stolen from the function
        # mahalanobis.
        for (i in 1:m) {
            # do not update upon numerical instability.
            if (any(is.na(S[,,i])) ||
                   det(as.matrix(S[,,i])) < num.inst) {
                next;
            }
            x.minus.mu <- sweep(x = x, MARGIN = 2, STATS = mu[i,],
                check.margin = FALSE);
            psi.inv <- solve(S[,,i]);
            sqrt.det.psi <- 1/sqrt(abs(det(as.matrix(S[,,i]))));

            mahalanobis.i <- rowSums((x.minus.mu %*% psi.inv) *
                x.minus.mu);

            if (is.infinite(nu)) {
               emission.prob[,i] <<- sqrt.det.psi*exp(-mahalanobis.i/2);
            }
            else {
               emission.prob[,i] <<- sqrt.det.psi/
                  (1 + mahalanobis.i / nu)^((nu + p) / 2);
            }

            # Use Mahalanobis distances to compute the weights.
            if (!is.infinite(nu)) {
               weights.i[,i] <- (nu + p) / (nu + mahalanobis.i);
            }
        }

        # Emission probabilities of NAs are set to 1 for every states.
        emission.prob[is.na(emission.prob)] <<- 1;

        # Prevent outlier overflow.
        # If 'emission.prob' contains 'Inf', the values are scaled
        # to to the 'overflow' parameter.
        infid <- which(is.infinite(emission.prob), arr.ind = TRUE);
        emission.prob[infid[,1],] <- overflow;
        emission.prob[infid] <- 1;


        if (adjust.initial.prob) {
            initial.prob <- initial.steady.state.probabilities();
        }

        loglik <<- 0;
        cumulative.n <- 0;
        transitions <- matrix(double(m * m), nrow = m);

        counter = 0;

        for (n.i in series.length) {

            counter = counter+1;

            forwardback <- .Fortran("fwdb",
                as.integer(m),
                as.integer(n.i),
                initial.prob,
                emission.prob[(cumulative.n + 1):(cumulative.n + n.i),], Q,
                double(n.i),
                matrix(double(n.i * m), nrow = n.i),
                matrix(double(m^2), nrow = m), double(1),
                PACKAGE = "HMMt");

            loglik <<- loglik + forwardback[[9]];
            phi[(cumulative.n + 1):(cumulative.n + n.i),] <<-
                forwardback[[7]];
            transitions <- transitions + forwardback[[8]];

            cumulative.n <- cumulative.n + n.i;

        }

        Q <<- transitions / rowSums(transitions);
        phi.weights <<- phi*weights.i;

    }

    # Update the means according to the model matrix.
    update.mu <- function() {
        new.mu <- mu * NA
        for (i in 1:p) {
            sumPhi.weights.x <- colSums(phi.weights*x[,i], na.rm = TRUE);
            est <- tapply(X = sumPhi.weights.x, INDEX = model[,i],
                FUN = sum) / tapply(X = sumPhi.weights,
                INDEX = model[,i], FUN = sum);
            new.mu[,i] <- round(est[as.character(model[,i])], dig);
        }
        return (new.mu)
    }

    # Update the variances-covariances.
    update.S <- function() {
        new.S <- S * NA
        for (i in 1:p) {
            for (j in 1:i) {
                sumPhi.weights.xij <- 
                    colSums(phi.weights*x[,i]*x[,j], na.rm = TRUE);
                sumPhi.weightsmuij <- sumPhi.weights*mu[,i]*mu[,j];
                dev <- sumPhi.weights.xij - sumPhi.weightsmuij
                est <- tapply(X = dev, INDEX = model[,i], FUN = sum) / 
                    tapply(X = sumPhi, INDEX = model[,i], FUN = sum);
                new.S[i,j,] <- round(est[as.character(model[,i])], dig)
                if (i != j) {
                    new.S[j,i,] <- new.S[i,j,];
                } 
            }
        }
        # Numerical instability can lead to non-posititivity
        # of the variance-covariance matrix. In that case it is
        # set to NA (caught in E.step()).
        for (i in 1:m) {
            eig <- eigen(new.S[,,i]);
            if (any(eig$values < 0)) {
                new.S[,,i] <- NA
                #                eig$values[eig$values < 0] <- 0;
                #new.S[,,i] <- eig$vectors %*% diag(eig$values) %*%
                #    solve(eig$vectors);
            }
        }
        return (new.S)
    }

    update.nu <- function() {
        weights = rowSums(phi.weights)
        RHS <- 1 + mean((log(weights) - weights), na.rm = TRUE)
        # Find an upper bound.
        no.upper.bound <- TRUE
        new.nu <- nu
        lower.bound <- 0
        while (no.upper.bound) {
            if (digamma(new.nu/2) - digamma((new.nu + p)/2) -
                log(new.nu/2) + log((new.nu + p)/2) < RHS) {
                    lower.bound <- new.nu
                    new.nu <- 2 * new.nu
            }
            else {
                no.upper.bound <- FALSE
                upper.bound <- new.nu
            }
        }
        # The degree of freedom new.nu is estimated with a
        # precision of 0.01
        while (upper.bound - lower.bound > 0.01) {
            new.nu <- (upper.bound + lower.bound) / 2
            if (digamma(new.nu/2) - digamma((new.nu + p)/2) -
                log(new.nu/2) + log((new.nu + p)/2) < RHS)
                    lower.bound <- new.nu
            else {
                upper.bound <- new.nu
            }
        }

        # Note: rounding prevents oscillation of the estimates.
        return (round(new.nu, 1))
    }



###############################################
#                 MAIN LOOP                   #
###############################################

    for (iter in 1:maxiter) {

        cat(paste("\riteration:", iter))

        # Update Q, phi.weights and emission.prob.
        E.step()

        # CM-step 1: updating mu and S.
        sumPhi <- colSums(phi, na.rm = TRUE);
        sumPhi.weights <- colSums(phi.weights, na.rm = TRUE);

        mu <- update.mu();
        S <- update.S();

        if (!is.infinite(nu)) {

            # Update Q, phi.weights and emission.prob.
            E.step()

            # CM-step 2: updating nu.
            nu <- update.nu()
            if (nu > 250) {
                nu <- Inf;
            }

        } # if (!is.infinite(nu))


        if (abs(loglik - old.loglik) < tol)
            break

        old.loglik <- loglik

    } # for (iter in 1:maxiter)

    cat("\n")

    if (adjust.initial.prob) {
        initial.prob <- initial.steady.state.probabilities()
    }
        
    vPath <- Viterbi(Q, initial.prob, emission.prob, series.length)

    # Returns an object of thethe class BaumWelchTfit.
    return(new("fittedHiddenMarkovModel.t", Q = Q, mu = mu, S = S,
        nu = nu, model = model, ViterbiPath = vPath, phi = phi,
        logL = loglik, iterations = iter))

}