vignettes/GMCM-Standalone.R

#
# This script reproduce all of the results in the article:
#
#   "Unsupervised Clustering and Meta-Analysis
#    using Gaussian Mixture Copula Models"
#
# It is structured after specific sections of the paper.
#
# Written by Anders E Bilgrau
#

## ---- chunk_1 ----

#
# General initial parameters, packages, and auxiliary functions used throughout
#

rm(list = ls()) # Remove any objects in workspace (ensure a clean run)

# Set working directory if nessesary
# setwd("<the directory of this file>")

# Should all saved data be recomputed?
recompute <- FALSE
# Set recompute to TRUE if saved data is to be recomputed and not just loaded.
# WARNING! Setting will increase the running time to ~ 5 days on a regular
# laptop (depending on the use of parallel computations).
# Alternatively, "cache.RData" can be deleted or particular objects can
# be removed using rm() to force the recomputation

# We load all saved objects that are computationally time consuming
if (file.exists("cache.RData")) {
  load("cache.RData")
}


# Global plotting parameters used in some plots
axiscex <- 1.7

#
# Loading needed libraries
#

#install.packages(c("GMCM", "idr", "Hmisc", "RColorBrewer", "jpeg"))
library("GMCM")
library("idr")
library("Hmisc")
library("RColorBrewer")
library("jpeg")

# To enable multicore processing, change the %do% to %dopar% and
# uncomment the relevant packages below.
library("foreach")
# library("doMC") #library("doParallel") # Use this package on windows
# registerDoMC(detectCores())

# Defining auxiliary functions used
# Pretty big number formatting
prettyN <- function (x) {
  prettyNum(x, big.mark = "{,}", scientific = FALSE)
}

# Function for appending objects to an existing .RData file
# I used modified version of a function created by user "flodel":
# http://stackoverflow.com/questions/11813096/updating-an-existing-rdata-file
resave <- function(..., list = character(), file) {
  if (!file.exists(file)) { # If file does not exists resave functions as save
    save(..., list = list, file = file)
  }
  previous  <- load(file)  # Returns the loaded object names
  var.names <- c(list, as.character(substitute(list(...)))[-1L])
  for (var in var.names) {
    assign(var, get(var, envir = parent.frame()))
  }
  save(list = unique(c(previous, var.names)), file = file)
}


################################################################################
# Create Figure 1 of Section "2. Gaussian mixture copula models"
################################################################################

# Setting a seed for the simulation of data
set.seed(120312)

# Choosing some arbitrary marignal distributions for the general GMCM
inv.M1 <- function(u) {
  M1 <- function(u1) qbeta(u1, shape1 = 0.5, shape2 = 0.9)
  M2 <- function(u2) qchisq(u2, df = 5)
  u[,1] <- M1(u[,1])
  u[,2] <- M2(u[,2])
  return(u)
}

# Choosing some arbitrary marignal distributions for the special GMCM
inv.M2 <- function(u) {
  M1 <- function(u1) qbeta(u1, shape1 = 0.5, shape2 = 0.9)
  M2 <- function(u2) qbeta(u2, shape1 = 1.3, shape2 = 1.1)
  u[,1] <- M1(u[,1])
  u[,2] <- M2(u[,2])
  return(u)
}

# Simulate general GMCM data
theta1 <- list(m = 3, d = 2,
               pie = c(1/6,3/6,2/6),
               mu = list(comp1 = c(0,0),
                         comp2 = c(2,-1)*2,
                         comp3 = c(-1,-0.5)*2),
               sigma = list(comp1 = cbind(c(1,0),c(0,1)),
                            comp2 = cbind(c(2,1.78),c(1.78,2)),
                            comp3 = cbind(c(1,-0.53),c(-0.53,1))))
class(theta1) <- "theta"
data1 <- SimulateGMCMData(n = 10000, theta = theta1)
z1 <- data1$z
u1 <- data1$u
x1 <- inv.M1(u1)

# Simulate special GMCM data
par <- c(0.8, mu = 2, sigma = 1, rho = 0.8)
data2 <- SimulateGMCMData(n = 10000, d = 2, par = par)
theta2 <- data2$theta
z2 <- data2$z
u2 <- data2$u
x2 <- inv.M2(u2)

#
# Create Figure 1
#

jpeg("Figure1.jpg", height = 2*7*0.5, width = 3*7*0.5, units = "in", res = 100)
{
  lab.cex <- 1
  # Setting plotting parameters
  par(mgp = c(2.3,0.8,0),
      oma = c(0,2,0,0)+0.1,
      mar = c(3, 3.3, 2, 0.1),
      xaxs = "i", yaxs = "i",
      mfrow = c(2,3),
      xpd = TRUE)

  col1 <- c("white", "grey60")
  col2 <- c("black", "orange", "steelblue")
  pchs <- c(1,3,4)
  pcex <- 0.6
  labs <- pretty(seq(0,1))

  # Panel A: Observed
  plot(x1, axes = FALSE, type = "n", xlab = "", ylab = "")
  points(x1, col = col2[data1$K], pch = pchs[data1$K], cex = pcex)
  mtext(expression(paste(x[1])), 1, line = 2, cex = lab.cex)
  mtext(expression(paste(x[2])), 2, line = 2, cex = lab.cex)
  axis(1, labels = FALSE, at = axTicks(1), lwd = axiscex)
  axis(2, labels = FALSE, at = axTicks(2), lwd = axiscex)
  mtext("A", line = 0.9, adj = -0.05, cex = 1, font = 2)
  mtext("Observed process", line = 0.5, cex = 1, font = 2)
  mtext("General GMCM", line = 3.5, side = 2, cex = 1, font = 2)

  # Panel B: Copula density
  plot(1, type = "n", axes = FALSE, xlim = c(0,1), xlab = "",
       ylim = c(0,1), ylab = "")#, asp = 1)
  points(u1, col = col2[data1$K], pch = pchs[data1$K], cex = pcex)
  mtext(expression(paste(u[1])), 1, line = 2, cex = lab.cex)
  mtext(expression(paste(u[2])), 2, line = 2, cex = lab.cex)
  axis(1, at = labs, lwd = axiscex)
  axis(2, at = labs, lwd = axiscex)
  mtext("B", line = 0.9, adj = -0.05, cex = 1, font = 2)
  mtext("Copula process", line = 0.5, cex = 1, font = 2)

  # Panel C: Latent
  plot(1, type = "n", axes = FALSE, xlim = range(z1[,1]),
       xlab = "", ylim = range(z1[,2]), ylab = "")#, asp = 1)
  mtext(expression(paste(z[1])), 1, line = 2, cex = lab.cex)
  mtext(expression(paste(z[2])), 2, line = 2, cex = lab.cex)
  points(z1, col = col2[data1$K], pch = pchs[data1$K], cex = pcex)
  axis(1, lwd = axiscex)
  axis(2, lwd = axiscex)
  mtext("C", line = 0.9, adj = -0.05, cex = 1, font = 2)
  mtext("Latent process", line = 0.5, cex = 1, font = 2)


  col2 <- c("steelblue", "black", "orange")

  # Panel D: Observed process
  plot(x2, axes = FALSE, type = "n", xlab = "", ylab = "")
  points(x2, col = col2[data2$K], pch = pchs[data2$K], cex = pcex)
  mtext(expression(paste(x[1])), 1, line = 2, cex = lab.cex)
  mtext(expression(paste(x[2])), 2, line = 2, cex = lab.cex)
  axis(1, labels=FALSE, at = axTicks(1), lwd = axiscex)
  axis(2, labels=FALSE, at = axTicks(2), lwd = axiscex)
  mtext("D", line = 0.9, adj = -0.05, cex = 1, font = 2)
  mtext("Special GMCM", line = 3.5, side = 2, cex = 1, font = 2)

  # Panel E: Copula density
  plot(1, type = "n", axes = FALSE, xlim = c(0,1), xlab = "",
       ylim = c(0,1), ylab = "")#, asp = 1)
  points(u2, col = col2[data2$K], pch = pchs[data2$K], cex = pcex)
  mtext(expression(paste(u[1])), 1, line = 2, cex = lab.cex)
  mtext(expression(paste(u[2])), 2, line = 2, cex = lab.cex)
  axis(1, at = labs, lwd = axiscex)
  axis(2, at = labs, lwd = axiscex)
  mtext("E", line = 0.9, adj = -0.05, cex = 1, font = 2)

  # Panel F: Latent process
  plot(1, type = "n", axes = FALSE, xlim = range(z2[,1]), xlab = "",
       ylim = range(z2[,2]), ylab = "")#, asp = 1)
  points(z2, col = col2[data2$K], pch = pchs[data2$K], cex = pcex)
  mtext(expression(paste(z[1])), 1, line = 2, cex = lab.cex)
  mtext(expression(paste(z[2])), 2, line = 2, cex = lab.cex)
  axis(1, lwd = axiscex)
  axis(2, lwd = axiscex)
  mtext("F", line = 0.9, adj = -0.05, cex = 1, font = 2)

}
dev.off()


################################################################################
# The following performs the speed tests of section
# "4.2. Runtime and technical comparison"
################################################################################

# Set seed for the simulation of data
set.seed(1)

# Setting parameters for the simulation
# Start values:
alpha <- 0.5
mu    <- 2.5
sigma <- 0.5
rho   <- 0.8

speed.eps      <- 0.001  # Convergence epsilon
max.ite        <- 1000   # Maximum iterations
speed.par      <- c(0.7, 2, 1, 0.9)  # True parameters
n.observations <- c(1000, 10000, 100000) # Simulated observations


# We only do the computation is the saved speed.res is not present
# or if recompute is set to TRUE.
if (!exists("speed.res") | recompute) {

  speed.res <- foreach(n = n.observations, .combine = "rbind",
                       .packages = c("GMCM", "idr")) %do% {

    # Simulating data
    x <- SimulateGMCMData(n = n, par = speed.par)

    # Ranking and scaling
    u <- Uhat(x$u)

    # Li et al. (2011) PEM algorithm and results
    idr.time <- system.time({
      idr.out <- est.IDR(u, mu, sigma, rho, alpha,
                         eps = speed.eps, max.ite = max.ite)
    })

    if (length(idr.out$loglik.trace) == 2) {
      stop("est.IDR only took two interations")
    }

    li.res <- unlist(idr.out$para[c("p", "mu", "sigma", "rho")])

    # GMCM package PEM timing
    gmcm.time.pem <- system.time({
      my.res <- fit.meta.GMCM(u,  init.par = c(alpha, mu, sigma, rho),
                              method = "PEM", trace.theta = TRUE,
                              max.ite = max.ite, verbose = FALSE)
    })

    # GMCM package NM timing
    gmcm.time.nm <- system.time({
      my.res2 <- fit.meta.GMCM(u, init.par = c(alpha, mu, sigma, rho),
                               method = "NM", trace.theta = TRUE,
                               max.ite = max.ite, verbose = FALSE)
    })

    # Getting convergence information
    aa <- c(it = length(idr.out$loglik.trace),   idr.time[3])
    bb <- c(it = ncol(my.res[[2]]$loglik.tr),    gmcm.time.pem[3])
    cc <- c(it = unname(my.res2[[2]]$counts[1]), gmcm.time.nm[3])

    res <- rbind("idr-pkg PEM" = aa, "gmcm-pkg PEM" = bb, "gmcm-pkg NM" = cc)
    res <- cbind(res, "$s/n$" = res[,2]/res[,1])
    res <- cbind(res, "Rel.\\ speed" = res[,3]/res[3,3])
    res <- cbind(rbind(unlist(idr.out$para)[c(1,3,4,2)],
                       my.res[[1]], my.res2[[1]]), res)
    res <- cbind(n.obs = n, res)
    # speed.res <- rbind(speed.res, res)
    cat("Speed test with", n, "observations finished\n"); flush.console()

    return(res)
  }

  resave(speed.res, file = "cache.RData")
}

# Print the results
#print(speed.res)

################################################################################
# The following performs the comparison of the different fitting procedures
# in section "4.2. Runtime and technical comparison"
# NOTE: Some of the computations are VERY time consuming. I.e., the computation
# time is measured in days for some as warned below.
################################################################################

# The structure of this section is as follows:
# 1. Some initialization of parameters and aux. functions
# 2. Simulate the "n.sims" datasets and starting parameters
# 3. Fit the special model using NM to the generated datasets
# 4. Fit the special model using SANN ...
# 5. Fit the special model using L-BFGS ...
# 6. Fit the special model using L-BFGS-B ...
# 7. Fit the special model using PEM ...
# 8. Fit the special model using PEM from the idr-package ...
# 9. Format the results and create Figure 4

# Setting seed to ensure reproducibiity of results
set.seed(1)

# Setting parameters of simulations
n.sims  <- simulation.n.sims  <- 1000  # Number of simulated datasets
n.obs   <- simulation.n.obs   <- 10000 # Number of observations in each dataset
max.ite <- simulation.max.ite <- 2000  # Maximum number of interations

d <- 2 # Dimension of data

# Define the true parameters
true.par  <- simulation.true.par <-
  c(alpha1 = 0.90, mu = 3, sigma = 2, rho = 0.5)

# List the available fitting methods
methods   <- c("NM", "SANN", "L-BFGS", "L-BFGS-B", "PEM (GMCM)", "PEM (idr)")
n.methods <- length(methods)

# Define an error and warning handler
# The function is based off
# http://svn.r-project.org/R/trunk/src/library/base/demo/error.catching.R
tryCatch.W.E <- function(expr) {
  W <- NULL
  w.handler <- function(w){ # warning handler
    W <<- w
    invokeRestart("muffleWarning")
  }
  e.handler <- function(e){
    e
  }
  value <-
    withCallingHandlers(tryCatch(expr, error = e.handler), warning = w.handler)
  if ("error" %in% class(value)) {
    E <- value
    value <- NULL
  } else {
    E <- NULL
  }
  return(list("value" = value, "error" = E, "warning" = W))
}

#
# Simulate GMCM data and starting parameters
# Approximate runtime: 13 seconds
#

cat("Simulating", n.sims, "datasets and start parameters... "); flush.console()

sim.tmp <- foreach(i = seq_len(n.sims), .packages = "GMCM") %do% {
  # Draw random starting values and simulate from the GMCM
  return(list(startval = c(rbeta(1,1.5,1.5), abs(rnorm(1,3,1)),
                           rchisq(1,1), rbeta(1,1.5,1.5)),
              simdata = SimulateGMCMData(n = n.obs, par = true.par, d = d)))
}

simulation.start.par <- t(sapply(sim.tmp, "[[", "startval"))
colnames(simulation.start.par) <- names(true.par)

simulation.data <- lapply(sim.tmp, "[[", "simdata")

rm(sim.tmp)

# To save space in the RData file. Data is not saved. Here.
cat("Done.\n")


#
# Fit using Nelder-Mead
#

# Only do the computation if stored data is not available or if forced by
# recompute
# Approximate runtime: 15 minutes (6 mins in parallel)
if (!exists("simulation.res.NM") | recompute) {
  st.tot <- proc.time()
  simulation.res.NM <-
    foreach(i = seq_len(n.sims), .packages = "GMCM", .inorder = FALSE) %do% {

      x <- Uhat(simulation.data[[i]]$u)
      K <- simulation.data[[i]]$K
      st <- proc.time()
      tmp <-
        tryCatch.W.E(fit.meta.GMCM(u = x, init.par = simulation.start.par[i, ],
                                   method = "NM", trace.theta = TRUE,
                                   positive.rho = TRUE, max.ite = max.ite,
                                   verbose = FALSE, reltol = 1e-4))
      time <- (proc.time()-st)[3]
      par  <- tmp$value[[1]]
      ite  <- tmp$value[[2]]$counts[[1]]
      Khat <- (get.IDR(x, par)$idr <= 0.5) + 1
      acc  <- sum(diag(table(K, Khat)))/length(K)

      return(list("par" = par, "error" = tmp$error, "warning" = tmp$warning,
                  "ite" = ite, "acc" = acc, "time" = time))
    }

  resave(simulation.res.NM, file = "cache.RData")
  cat("\n## Nelder-Mead done ##\n\n")
  cat("Time:", (proc.time()-st.tot)[3] %/% 60 ,"min ellapsed\n")
}

#
# Fit using SANN
#

set.seed(65) # Since the SANN procedure is stochastic

# Approximate runtime: 309 minutes ~ 5 hours (WARNING !!!) (2.2 hrs with par)
#
if (!exists("simulation.res.SANN") | recompute) {
  st.tot <- proc.time()
  simulation.res.SANN <-
    foreach(i = seq_len(n.sims), .packages = "GMCM", .inorder = FALSE) %do% {

      x <- Uhat(simulation.data[[i]]$u)
      K <- simulation.data[[i]]$K
      st <- proc.time()
      tmp <-
        tryCatch.W.E(fit.meta.GMCM(u = x, init.par = simulation.start.par[i, ],
                                   method = "SANN", trace.theta = TRUE,
                                   positive.rho = TRUE, max.ite = 3000,
                                   verbose = FALSE))
      time <- (proc.time()-st)[3]
      par <- tmp$value[[1]]
      ite <- tmp$value[[2]]$counts[[1]]
      if(!is.null(par)) {
        Khat <- (get.IDR(x, par)$idr <= 0.5)+1
        acc <- sum(diag(table(K, Khat)))/length(K)
      } else {
        Khat <- NULL
        acc <- NULL
      }

      return(list("par" = par,
                  "error" = tmp$error,
                  "warning" = tmp$warning,
                  "ite" = ite,
                  "acc" = acc, "time" = time))
    }

  resave(simulation.res.SANN, file = "cache.RData")
  cat("\n## SANN done ##\n\n")
  cat("Time:", (proc.time()-st.tot)[3] %/% 60 ,"min ellapsed\n")
}

#
# Fit using L-BFGS
#

# Approximate runtime: 85 minutes ~ 1.5 hours (!!!)

if (!exists("simulation.res.LBFGS") | recompute) {
  st.tot <- proc.time()
  simulation.res.LBFGS <-
    foreach(i = seq_len(n.sims), .packages = "GMCM", .inorder = FALSE) %do% {

      x <- Uhat(simulation.data[[i]]$u)
      K <- simulation.data[[i]]$K
      st <- proc.time()
      tmp <-
        tryCatch.W.E(fit.meta.GMCM(u = x, init.par = simulation.start.par[i, ],
                                   method = "L-BFGS", trace.theta = TRUE,
                                   positive.rho = TRUE, max.ite = max.ite,
                                   verbose = FALSE, factr = 1e-4))
      time <- (proc.time()-st)[3]
      par <- tmp$value[[1]]
      ite <- tmp$value[[2]]$counts[[1]]
      if(!is.null(par)) {
        Khat <- (get.IDR(x, par)$idr<=0.5)+1
        acc <- sum(diag(table(K, Khat)))/length(K)
      } else {
        Khat <- NULL
        acc <- NULL
      }
      return(list("par" = par, "error" = tmp$error,
                  "warning" = tmp$warning, "ite" = ite,
                  "acc" = acc, "time" = time))
    }
  cat("\n## L-BFGS done ##\n\n")
  resave(simulation.res.LBFGS, file = "cache.RData")
}

#
# Fit using L-BFGS-B
#

# Approximate runtime: 96 minutes ~ 1.5 hours (half if parallel)

if (!exists("simulation.res.LBFGSB") | recompute) {
  st.tot <- proc.time()
  simulation.res.LBFGSB <-
    foreach(i = seq_len(n.sims), .packages = "GMCM", .inorder = FALSE) %do% {

      x <- Uhat(simulation.data[[i]]$u)
      K <- simulation.data[[i]]$K
      st <- proc.time()
      tmp <-
        tryCatch.W.E(fit.meta.GMCM(u = x, init.par = simulation.start.par[i, ],
                                   method = "L-BFGS-B", trace.theta = TRUE,
                                   positive.rho = TRUE, max.ite = max.ite,
                                   verbose = FALSE, factr = 1e-4))
      time <- (proc.time()-st)[3]
      par <- tmp$value[[1]]
      ite <- tmp$value[[2]]$counts[[1]]
      if(!is.null(par)) {
        Khat <- (get.IDR(x, par)$idr<=0.5)+1
        acc <- sum(diag(table(K, Khat)))/length(K)
      } else {
        Khat <- NULL
        acc <- NULL
      }

      return(list("par" = par, "error" = tmp$error, "warning" = tmp$warning,
                  "ite" = ite, "acc" = acc, "time" = time))
    }
  cat("\n\n## L-BFGS-B done ##\n\n\n")
  resave(simulation.res.LBFGSB, file = "cache.RData")
}


#
# Fit using PEM in GMCM
#

# Approximate runtime: 175 minutes ~ 3 hours (!!!)

if (!exists("simulation.res.PEM") | recompute) {
  st.tot <- proc.time()
  simulation.res.PEM <-
    foreach(i = seq_len(n.sims), .packages = "GMCM", .inorder = FALSE) %do% {
      x <- Uhat(simulation.data[[i]]$u)
      K <- simulation.data[[i]]$K
      st <- proc.time()
      tmp <-
        tryCatch.W.E(fit.meta.GMCM(u = x, init.par = simulation.start.par[i, ],
                                   method = "PEM", trace.theta = TRUE,
                                   positive.rho = TRUE, max.ite = max.ite,
                                   verbose = FALSE, eps = 1e-4))
      time <- (proc.time()-st)[3]
      par <- tmp$value[[1]]
      ite <- length(tmp$value[[2]]$theta.tr)
      if(!is.null(par)) {
        Khat <- (get.IDR(x, par)$idr<=0.5)+1
        acc <- sum(diag(table(K, Khat)))/length(K)
      } else {
        Khat <- NULL
        acc <- NULL
      }
      return(list("par" = par, "error" = tmp$error, "warning" = tmp$warning,
                  "ite" = ite, "acc" = acc, "time" = time))
    }
  cat("\n\n## PEM in GMCM done ##\n\n\n")
  resave(simulation.res.PEM, file = "cache.RData")
}

#
# Fit using PEM in idr
#

# WARNING: if the file is missing or recompute is TRUE,
# then running the following if condition will take a long time!

# Approximate runtime: 5810 minutes ~ 4 days (!!! WARNING !!!)

if (!exists("simulation.res.PEMidr") | recompute) {
  st.tot <- proc.time()
  simulation.res.PEMidr <-
    foreach(i = seq_len(n.sims), .packages = "idr", .inorder = FALSE) %do% {
      x <- Uhat(simulation.data[[i]]$u)
      K <- simulation.data[[i]]$K
      start.par <- simulation.start.par[i, ]
      st <- proc.time()
      tmp <-
        tryCatch.W.E(est.IDR(x = x, mu = start.par[2], sigma = start.par[3],
                             rho = start.par[4], p = 1-start.par[1],
                             eps = 1e-4, max.ite = max.ite))
      time <- (proc.time()-st)[3]
      par <- unlist(tmp$value$para)[c("p","mu","sigma","rho")]
      par[1] <- 1 - par[1]
      ite <- length(tmp$value$loglik.trace)
      if(!is.null(par)) {
        Khat <- (get.IDR(x, par)$idr<=0.5)+1
        acc <- sum(diag(table(K, Khat)))/length(K)
      } else {
        Khat <- NULL
        acc <- NULL
      }
      return(list("par" = par, "error" = tmp$error, "warning" = tmp$warning,
                  "ite" = ite, "acc" = acc, "time" = time))
    }
  cat("\n## PEM in idr done ##\n\n")
  resave(simulation.res.PEMidr, file = "cache.RData")
}



#
# Extract the information from the fitting procedures and
# create Figure 4
#

# Function to get data from the lists
Get <- function(x, y) {
  x <- lapply(x, "[[", y)
  x <- x[!sapply(x, is.null)]
  if (identical(x, list())) {
    integer(0)
  } else {
    simplify2array(x)
  }
}

# Function to handle some formatting
formatForFig <- function (x, digits = 2) {
  m1 <- mean(x, na.rm = TRUE)
  m2 <- sd(x, na.rm = TRUE)
  m1 <- formatC(m1, format = "f", digits = digits)
  m2 <- formatC(m2, format = "f", digits = digits)
  paste(m1, "\n(", m2, ")", sep = "")
}

# Initialize object to store information about the number of iterations to
# reach convergence, warning messages, and error messages.
simulation.ites  <- vector("list", length(methods))
warning.messages <- vector("list", length(methods))
error.messages   <- vector("list", length(methods))
names(simulation.ites)  <- rev(methods) # Naming the lists
names(warning.messages) <- rev(methods)
names(error.messages)   <- rev(methods)

# Parameter estimates
jpeg("Figure4.jpg", height = 7, width = 2*7, units = "in", res = 100)
{
  # Setting plotting parameters
  par(mfrow = c(1,6),
      mar = c(4.1, 0, 3.1, 0),
      oma = c(0,6,0,2),
      cex = 1.2,
      mgp = c(2.3,0.8,0))

  cols <- brewer.pal(n = length(methods), "Dark2") # Defining colours

  # Defining some helpful objects
  estimate  <- c("alpha[1]", "mu", "sigma", "rho")
  estimate2 <- c("hat(alpha)[1]", "hat(mu)", "hat(sigma)", "hat(rho)")
  ylims <- rbind(c(0, -10, 0, -0.5, 0), c(1, 10, 6, 1, 1))

  # Plotting the six "panels"
  for (i in 1:6) {
    if (i <= 4) {
      plot(1, ylim = c(0.5,6.5) ,
           type = "n", xlim = ylims[,i], axes = FALSE,
           ylab = "",
           xlab = parse(text = estimate2[i]))
      header <- parse(text=paste(estimate,"==",true.par,sep=""))[i]
      title(main = header)
      if (i == 1)
        axis(2, at = length(methods):1, lab = methods, las = 2, lwd = axiscex)
      abline(v = true.par[i], col = "dark gray")
      box(lwd = axiscex)
    } else {
      if (i == 6) {
        plot(1, xlim = c(0,2000), ylim = c(0.5,6.5), type = "n", axes = FALSE,
             xlab = "Iterations", ylab = "", main = "")
        axis(3, lwd = axiscex)
        box(lwd = axiscex)
      }
      if (i == 5) {
        plot(1, xlim = c(0,1), ylim = c(0.5,6.5), type = "n", axes = FALSE,
             xlab = "", ylab = "", xaxt = "n")
      }

    }

    if (i != 5) {
      if (i %in% 2:3) {
        lab <- c("", axTicks(1)[-c(1,length(axTicks(1)))], "")
      } else {
        lab <- axTicks(1)
      }
      axis(1, at = axTicks(1), labels = lab, lwd = axiscex)
    }

    for (j in length(methods):1) {

      data <- switch(j,
                     simulation.res.PEMidr,
                     simulation.res.PEM,
                     simulation.res.LBFGSB,
                     simulation.res.LBFGS,
                     simulation.res.SANN,
                     simulation.res.NM)


      not.failed <- sapply(data, function(x) is.null(x[["error"]]))
      pars <- simplify2array(lapply(data, "[[", "par")[not.failed])

      error.messages[[j]] <-
        sapply(data[!not.failed], function(x) x$error$message)

      warning.messages[[j]] <-
        Get(data,"warning")

      simulation.ites[[j]] <-
        n.ites <- Get(data, "ite")
      acc <- formatForFig(Get(data, "acc"))
      tim <- round(sum(Get(data, "time"), na.rm = TRUE) / 60)

      war <- length(Get(data,"warning"))/2
      err <- sum(!not.failed)

      if (i <= 4) {
        y <- pars[i, ]
        x <- rep(j, length(y))
        points(y,
               x + runif(length(x), -1, 1)*0.18,
               #x + rnorm(length(x), sd = 0.1),
               cex = 0.55, pch = 21,
               bg = paste(cols[j], "40", sep = ""),
               col = "#00000060")

        segments(median(y, na.rm = TRUE), y0 = j-0.3, y1 = j+0.3, lwd = 2)
      } else {
        if (i == 6)
          segments(n.ites, y0 = j-0.3, y1 = j+0.3,
                   col = paste(cols[j], "80", sep = ""))
        segments(median(n.ites, na.rm = TRUE), y0 = j-0.4, y1 = j+0.4,
                 lwd = 2)
        if (j == 5) text(j, 1000, "(3000)")
        if (i == 5) {
          labels <- c("Accuracy", "Time", "Warnings", "Errors")
          at <- seq(0.15,0.9,l = length(labels))
          axis(3, at = at, las = 2, labels = labels, tick = FALSE, hadj = 0.5,
               lwd = axiscex)
          axis(1, at = at, las = 2, labels = labels, tick = FALSE, hadj = 0.5,
               lwd = axiscex)

          text(at, j, c(acc, tim, war, err), cex = 0.8)
        }

      }
    }
  }
}
dev.off()

#
# Extract the number of times the maximum number interations was hit
#

simulation.max.ite.hit <-
  sapply(simulation.ites, function(x) sum(x %in% (simulation.max.ite+c(0,1))))

#print(simulation.max.ite.hit)


# Check errors and warnings
lapply(error.messages,
       function(x) table(if (length(x) == 0) ("Non") else (x)))

lapply(warning.messages,
       function(x) table(if (length(x) == 0) ("Non") else (unlist(x[1, ]))))


################################################################################
# The following performs the reproducibility analysis of u133VsExon
# ranked data using GMCM
# The results appear in section 5.1. Reproducibility of microarray results
################################################################################

# Load ranked data from GMCM package
data(u133VsExon) # The object is named u133VsExon

# Rename the data and Benjamini-Hochberg adjust the p-values
p.vals <- u133VsExon
adj.p.vals <- data.frame(u133 = p.adjust(p.vals$u133, method = "BH"),
                         exon = p.adjust(p.vals$exon, method = "BH"))

# Ranking and scaling using Uhat
u <- Uhat(1 - p.vals)  # Which is the same as Uhat(-log(p.vals))

# Define a list of three different intial parameters
init.par <- list(c(0.5, 1, 1, 0.5),
                 c(0.999, 5, 1, 0.5),
                 c(0.5, 1, 5, 0.999))

# Fitting GMCMs for each of the initial parameters
# We only do the computation if it is not present as and .RData object.
if (!exists("bcell.ite") | recompute) {
  log.lik <- -Inf
  for (i in 1:length(init.par)) {
    tmp.bcell.ite <- fit.meta.GMCM(u = u,
                                   init.par = init.par[[i]],
                                   method = "NM",
                                   trace.theta = TRUE)
    if (tmp.bcell.ite[[2]]$value > log.lik) {
      cat(i, "\n\n");
      bcell.ite <- tmp.bcell.ite
    }
  }
  resave(bcell.ite, file = "cache.RData")
}

# Extract the parameters and number of iterations
bcell.par <- bcell.ite[[1]]
bcell.n.ite <- unname(bcell.ite[[2]]$counts[1])

# Extractng the idr and IDR values
bcell.res <- get.IDR(x = u, par = bcell.par)

# Get marginally significant genes
exon.sig <- adj.p.vals$exon <= 0.05
u133.sig <- adj.p.vals$u133 <= 0.05

# Plotting the results and creating Figure 5
jpeg("Figure5.jpg", height = 7*0.5, width = 3*7*0.5, units = "in", res = 100)
{
  bcell.cols <- c("grey", "steelblue")#, "black")

  par(mgp = c(2.3,0.8,0),
      oma = c(0,0,0,0)+0.1,
      mar = c(3, 3.3, 2, 0.1),
      xaxs = "i", yaxs = "i",
      #mfrow = c(1,2),
      xpd = TRUE)
  layout(rbind(c(3,1,1,2,2,3)))

  # Determining colouring
  col.num <- (bcell.res$IDR <= 0.05) + 1

  labs <- pretty(seq(0,1))

  plot(u, main = "Ranked values",#"GC cells vs B-cells\nGMCM process",
       cex = 0.4, pch = 16, axes = FALSE, xlab = "", ylab = "",
       col = bcell.cols[col.num])
  mtext(expression(hat(z)[list(g, scriptstyle(U133))]), 1, line = 2, cex = 0.7)
  mtext(expression(hat(z)[list(g, scriptstyle(Exon))]), 2, line = 2, cex = 0.7)
  axis(1, at = labs, lwd = axiscex)
  axis(2, at = labs, lwd = axiscex)
  mtext("A", line = 0.9, adj = -0.05, cex = 1, font = 2)

  plot(GMCM:::qgmm.marginal(u = u, theta = meta2full(bcell.par, d = 2)),
       main = "Pseudo observations",#"GC cells vs B-cells\nEstimated GMM process",
       cex = 0.3, pch = 16, axes = FALSE, xlab = "", ylab = "",
       col = bcell.cols[col.num])
  mtext(expression(hat(z)[list(g, scriptstyle(U133))]), 1, line = 2, cex = 0.7)
  mtext(expression(hat(z)[list(g, scriptstyle(Exon))]), 2, line = 2, cex = 0.7)
  axis(1, lwd = axiscex)
  axis(2, lwd = axiscex)

  legend("topleft", pch = c(16, 16), col = bcell.cols, text.font = 2,
         legend = paste(c("Irreproducible genes\nn =",
                          "Reproducible genes\nn ="), table(bcell.res$Khat)),
         bty = "n", inset = 0.01, y.intersp = 2)
  par <- round(bcell.par,2)
  A <- bquote(alpha[1] == .(par[1]))
  B <- bquote(mu == .(par[2]))
  C <- bquote(sigma == .(par[3]))
  D <- bquote(rho == .(par[4]))
  legend("bottomright", bty = "n", inset = 0.01, y.intersp = 1,
         legend = c(A, B, C, D, expression()))
  mtext("B", line = 0.9, adj = -0.05, cex = 1, font = 2)
}
dev.off()



################################################################################
# The following performs the image segmentation in
# in section 5.3. Image segmentation using the general GMCM
################################################################################

# Define the number of colours to segment the data into
n.cols <- 10

fig7.file <- "./Figure7.jpg"

# Download the STS-27 atlantis take off (public domain)
# https://commons.wikimedia.org/wiki/File:Atlantis_taking_off_on_STS-27.jpg
if (!file.exists(fig7.file)) {
  url <- "https://user-images.githubusercontent.com/6087024/52737968-e5b29d00-2fcd-11e9-8fc5-90bda78e9ff5.jpg"
  download.file(url, destfile = fig7.file, mode = "wb")
}

# Read the image
pic <- readJPEG(fig7.file)

nn <- dim(pic)[1]
mm <- dim(pic)[2]

seg.gmcm <- seg.km <- pic  # To hold the different segmented pics

# Format the picture in a matrix with 3 columns
pic.rgbmat <- cbind(red   = c(pic[,,1]),
                    green = c(pic[,,2]),
                    blue  = c(pic[,,3]))

#
# Segmentation using GMCM
#

gmcm.file <- gsub(".jpg$", "_gmcm.RData", fig7.file)
# Approximate runtime: 50 minutes
if (!exists("seg.res.gmcm") | recompute) {
  best.loglik <- -Inf
  for (i in 1:10) {
    cat("i =", i, "\n"); flush.console()

    # Set a seed
    set.seed(i)

    # Choose starting parameters
    start.theta <- choose.theta(pic.rgbmat, m = n.cols,
                                iter.max = 10)

    # Fit the full GMCM
    seg.res.gmcm <-
      fit.full.GMCM(u = pic.rgbmat,
                    theta = start.theta,
                    method = "PEM",
                    max.ite = 100,
                    verbose = TRUE,
                    eps = 1e-4,
                    convergence.criterion = "GMCM")

    # Compute the log likelihood of the maximizing parameters
    loglik <- GMCM:::dgmcm.loglik(seg.res.gmcm, u = pic.rgbmat)

    # Update if a better estiamte is found
    if (best.loglik < loglik) {
      cat("Loglik updated (", loglik, ")\n", sep = ""); flush.console()
      best.i <- i
      best.loglik <- loglik
      best.seg.res.gmcm <- seg.res.gmcm
    }
  }

  # Save results
  seg.res.gmcm <- best.seg.res.gmcm
  resave(seg.res.gmcm, file = "cache.RData")
}

class(seg.res.gmcm) <- "theta" # HOTFIX

# Get the most likely component
gmcm.class <- apply(get.prob(pic.rgbmat, theta = seg.res.gmcm), 1, which.max)

# Extracting the colour for the estimated component
mus <- do.call(rbind, seg.res.gmcm$mu)
ranked.rgb.cols <- GMCM:::pgmm.marginal(mus, seg.res.gmcm)
rgb.cols <-
  sapply(1:3, function(i) quantile(x = pic.rgbmat[,i],
                                   prob = ranked.rgb.cols[,i],
                                   type = 1))

# Arrange into 3 dimensional array
gmcm.rgbmat <- rgb.cols[gmcm.class, ]
seg.gmcm[,,1] <- matrix(gmcm.rgbmat[,1], nn, mm)
seg.gmcm[,,2] <- matrix(gmcm.rgbmat[,2], nn, mm)
seg.gmcm[,,3] <- matrix(gmcm.rgbmat[,3], nn, mm)


#
# Segmentation using K-means clustering
#

km.file <- gsub(".jpg$", "_km.RData", fig7.file)
if (!exists("seg.res.km") | recompute) {
  system.time(
    seg.res.km <-
      kmeans(x = pic.rgbmat, centers = n.cols, iter.max = 100)
  )
  resave(seg.res.km, file = "cache.RData")
}

km.rgbmat <- seg.res.km$centers[seg.res.km$cluster, ]
seg.km[,,1] <- matrix(km.rgbmat[,1], nn, mm)
seg.km[,,2] <- matrix(km.rgbmat[,2], nn, mm)
seg.km[,,3] <- matrix(km.rgbmat[,3], nn, mm)

#
# Create segmented pictures
#

writeJPEG(seg.gmcm, target = gsub(".RData$", ".jpg", gmcm.file), quality = 0.8)
writeJPEG(seg.km,   target = gsub(".RData$", ".jpg", km.file),   quality = 0.8)



################################################################################
# The following performs the Fresh vs Frozen reproducibility analysis
# in section 5.2. Effects of cryopreservation on reproducibility
################################################################################

set.seed(1)

# Load ranked data from GMCM package
data(freshVsFrozen) # The object is named freshVsFrozen

freshfroz.tstat <- freshVsFrozen[, c(1, 3)]
freshVsFrozen$Fresh.adj.pval <-
  p.adjust(freshVsFrozen$PreVsPost.Fresh.pval,  method="BH")
freshVsFrozen$Frozen.adj.pval <-
  p.adjust(freshVsFrozen$PreVsPost.Frozen.pval, method="BH")
freshfroz.pval  <- freshVsFrozen[, c(2, 4)]
freshVsFrozen$adj.FreshVsFrozen.pval <-
  p.adjust(freshVsFrozen$FreshVsFrozen.pval, method = "BH")
# Absolute t-score and ranking
freshfroz <- Uhat(abs(freshfroz.tstat))
freshfroz.n.fits <- 40

if (!exists("best.par.freshfroz") | recompute) {
  best.ll <- -Inf
  best.par <- NULL
  best.init.par <- NULL
  for (i in 1:freshfroz.n.fits) {
    init.par <- c(runif(1, 0.05, 0.95), rchisq(2, df = 1), runif(1, 0.05, 0.95))

    # Fit model
    gmcm.par <-
      fit.meta.GMCM(u = freshfroz, init.par = init.par, method = "NM",
                    max.ite = 1000, reltol = 1e-5, verbose = FALSE)
    # Compute loglikelihood
    ll <- GMCM:::dgmcm.loglik(u = freshfroz, theta = meta2full(gmcm.par, d = 2))

    cat("Fit", i, "done.\tll =", sprintf("%.5f", ll),
        "\tbest.ll =", sprintf("%.5f", best.ll)); flush.console()
    if (ll > best.ll) {
      best.ll <- ll
      best.par <- gmcm.par
      best.init.par <- init.par
      cat("\t(best.ll updated!)"); flush.console()
    }
    cat("\n")
  }
  best.par.freshfroz <- best.par
  resave(best.par.freshfroz, file = "cache.RData")
}

# Get idr values
freshfroz.idr <- get.IDR(x = freshfroz, par = best.par.freshfroz)
freshfroz.group <- (freshfroz.idr$idr < 0.5) + (freshfroz.idr$IDR < 0.05) + 1

# Tests for dependency between significane of FreshVsFrozen compared to
# idr
suppressWarnings({
freshfroz.ctest <- cor.test(freshfroz.idr$idr, freshVsFrozen$FreshVsFrozen.pval,
                            method = "spearman")
})

freshfroz.table <- table(irreproducible = freshfroz.group == 1,
                     sig.diff = freshVsFrozen$adj.FreshVsFrozen.pval <= 0.05)
freshfroz.htest <- fisher.test(freshfroz.table)


#
# Overlap with differentially expressed genes across Fresh and Frozen
#

# Plotting and reporting
is.sig <- freshVsFrozen$adj.FreshVsFrozen.pval <= 0.05
ng <- c(table(freshfroz.group), sum(is.sig))
legend <- paste0(c("Irreproducible\nn = ",
                   "Reproducible\nn = ",
                   "Highly reproducible\nn = ",
                   "Adj. p-value < 0.05\n(Fresh vs Frozen)\nn = "), ng)

jpeg("Figure6.jpg", height = 7*0.5, width = 3*7*0.5, units = "in", res = 100)
{
  freshfroz.cols <- c("grey", "steelblue", "black", "red")
  freshfroz.cols2 <- freshfroz.cols[ifelse(is.sig, 4, freshfroz.group)]


  par(mfrow = c(1, 3),
      mar = c(3, 3.3, 2, 0.1),
      oma = c(0.1, 0.1, 0.1, 0.1),
      mgp = c(2.1, 1, 0),
      xaxs = "i", yaxs = "i",
      xpd = TRUE)
  labs <- pretty(seq(0,1))

  plot(1 - freshfroz.pval, #asp = 1,
       pch = 16, cex = 0.4, xlab = "", ylab = "", axes = FALSE,
       main = "1 - p-value", col = freshfroz.cols2)
  points(1 - freshfroz.pval[is.sig, ], pch = 15, cex = 0.6,
        col = freshfroz.cols2[is.sig])

  mtext(side = 1, expression(1 - p[list(g, scriptstyle(fresh))]),
        line = 2, cex = 0.6)
  mtext(side = 2, expression(1 - p[list(g, scriptstyle(frozen))]),
        line = 2, cex = 0.6)
  axis(1, at = labs, lwd = axiscex)
  axis(2, at = labs, lwd = axiscex)
  mtext("A", line = 0.9, adj = -0.05, cex = 1, font = 2)

  plot(freshfroz, xlab = "", ylab = "", pch = 16, cex = 0.4, #asp = 1,
       axes = FALSE, main = "Ranked values",
       col = freshfroz.cols2, xpd = TRUE)
  points(freshfroz[is.sig, ], pch = 15, cex = 0.6,
         col = freshfroz.cols2[is.sig])
  mtext(expression(hat(u)[list(g, scriptstyle(fresh))]), 1,
        line = 2, cex = 0.7, xpd = TRUE)
  mtext(expression(hat(u)[list(g, scriptstyle(frozen))]), 2,
        line = 2, cex = 0.7, xpd = TRUE)
  axis(1, at = labs, lwd = axiscex)
  axis(2, at = labs, lwd = axiscex)
  mtext("B", line = 0.9, adj = -0.05, cex = 1, font = 2)

  freshfroz.pseudo <-
    GMCM:::qgmm.marginal(u = freshfroz,
                         theta = meta2full(best.par.freshfroz, d = 2))
  plot(freshfroz.pseudo, xlab = "", ylab = "",  pch = 16, cex = 0.4, #asp = 1,
       axes = FALSE, main = "Pseudo observations",
       col = freshfroz.cols2, xpd = TRUE)
  points(freshfroz.pseudo[is.sig, ], pch = 15, cex = 0.6,
         col = freshfroz.cols2[is.sig])
  mtext(expression(hat(z)[list(g,scriptstyle(fresh))]), 1,
        line = 2, cex = 0.7,xpd = TRUE)
  mtext(expression(hat(z)[list(g,scriptstyle(frozen))]), 2,
        line = 2, cex = 0.7, xpd = TRUE)
  axis(1); axis(2)
  mtext("C", line = 0.9, adj = -0.05, cex = 1, font = 2)

  legend("topleft", pch = c(16,16,16), col = freshfroz.cols[-4], text.font = 2,
         pt.cex = rep(0.5,3)*2.5,
         inset = 0.01, legend = legend[-4],
         bg = "#FFFFFF00", bty = "n", y.intersp = 2)
  legend("bottomright", pch = 15, col = freshfroz.cols[4], text.font = 2,
         pt.cex = 0.6*2.5,
         inset = 0.01, legend = legend[4],
         bg = "#FFFFFF00", bty = "n", y.intersp = 2)
}
dev.off()

## ----
# chunk_1 ends here!



################################################################################
# The following generates the results which appear as chunks in the
# Sweave document in section 5.2. Image segmentation using the general GMCM
################################################################################

# Chunk 1 runs this entire script above

# Chunk 2 demo
## ---- start_example ----
library("GMCM")
set.seed(100)
n <- 10000
sim <- SimulateGMCMData(n = n, theta = rtheta(m = 3, d = 2))
## ----


# Chunk 3 simply evaluates Chunk 2


# Chunk 4 create figure 2
## ---- make_simulation_example_png ----
jpeg("Figure2.jpg", width = 3*7*0.5, height = 7*0.5, units = "in", res = 100)
{
  par(mar = c(3, 3.3, 2, 0.1),
      oma = c(0, 0, 0, 0) + .1,
      mgp = c(2.1, 1, 0),
      xaxs = "i", yaxs = "i",
      xpd = TRUE)
  layout(rbind(c(3,1,2,3)), widths = c(1/6, 1/3, 1/3, 1/6))

  pchs <- c(1,3,4)
  eks.col <- c("orange", "black", "steelblue")
  labs <- pretty(seq(0,1))

  # Panel A
  plot(sim$z, col = eks.col[sim$K], main = "Latent GMM process", axes = FALSE,
       xlab = "", ylab = "", cex = 0.9, pch = pchs[sim$K])
  mtext(expression(z[1]), 1, line = 2, cex = 0.6)
  mtext(expression(z[2]), 2, line = 2, cex = 0.6)
  axis(1, lwd = axiscex)
  axis(2, lwd = axiscex)
  mtext("A", line = 0.9, adj = -0.05, cex = 1, font = 2)

  # Panel B
  plot(sim$u, col = eks.col[sim$K], main = "GMCM process", axes = FALSE,
       xlab = "", ylab = "", cex = 0.9, pch = pchs[sim$K])
  mtext(expression(u[1]), 1, line = 2, cex = 0.6)
  mtext(expression(u[2]), 2, line = 2, cex = 0.6)
  axis(1, at = labs, lwd = axiscex)
  axis(2, at = labs, lwd = axiscex)
  mtext("B", line = 0.9, adj = -0.05, cex = 1, font = 2)
}
dev.off()
## ----


# Chunk 6
# Simply times chunk 5 and caputures output
st <- proc.time()
info <- capture.output({
# Chunk 5
## ---- run_example ----
ranked.data <- Uhat(sim$u)
start.theta <- choose.theta(ranked.data, m = 3)
mle.theta <- fit.full.GMCM(u = ranked.data, theta = start.theta,
                           method = "NM", max.ite = 10000, reltol = 1e-4)
kappa <- get.prob(ranked.data, theta = mle.theta)
Khat <- apply(kappa, 1, which.max)
## ----
})
elapsed <- round((proc.time() - st)[3], 1)
nm.ite <- as.numeric(sub(" *([0-9]+) [a-z ]+", "\\1", info[length(info)]))


# Chunk 7 (Creates Figure 3)
## ---- simulation_example_results ----
fit <- mle.theta
Khat.tmp <- rep(NA, length(Khat))
Khat.tmp[Khat==1] <- 1
Khat.tmp[Khat==2] <- 2
Khat.tmp[Khat==3] <- 3
Khat <- Khat.tmp

# Plot the results
jpeg("Figure3.jpg", width = 3*7*0.5, height = 7*0.5, units = "in", res = 100)
{

  par(mfrow = c(1, 3),
      mar = c(3, 3.3, 2, 0.1),
      oma = c(0, 0, 0, 0) + 0.1,
      mgp = c(2.1, 1, 0),
      xaxs = "i", yaxs = "i",
      xpd = TRUE)
  labs <- pretty(seq(0,1))

  plot(sim$u, col = eks.col[Khat], main = "Clustering", axes = FALSE,
       xlab = "", ylab = "", cex = 0.7,
       pch = pchs[Khat])
  mtext(expression(u[1]), 1, line = 2, cex = 0.6)
  mtext(expression(u[2]), 2, line = 2, cex = 0.6)
  axis(1, at = labs, lwd = axiscex)
  axis(2, at = labs, lwd = axiscex)
  mtext("A", line = 0.9, adj = -0.05, cex = 1, font = 2)

  simfit <- SimulateGMCMData(n, theta = fit)
  plot(simfit$z, col = eks.col[simfit$K],
       main = "Model check 1", axes = FALSE,
       xlab = "", ylab = "", cex = 0.7,
       pch = pchs[simfit$K])
  mtext(expression(z[1]), 1, line = 2, cex = 0.6)
  mtext(expression(z[2]), 2, line = 2, cex = 0.6)
  axis(1, lwd = axiscex)
  axis(2, lwd = axiscex)
  mtext("B", line = 0.9, adj = -0.05, cex = 1, font = 2)

  plot(simfit$u, col = eks.col[simfit$K],
       main = "Model check 2", axes = FALSE,
       xlab = "", ylab = "", cex = 0.7, pch = pchs[simfit$K])
  mtext(expression(u[1]), 1, line = 2, cex = 0.6)
  mtext(expression(u[2]), 2, line = 2, cex = 0.6)
  axis(1, at = labs, lwd = axiscex)
  axis(2, at = labs, lwd = axiscex)
  mtext("C", line = 0.9, adj = -0.05, cex = 1, font = 2)
}
dev.off()
## ----



# Chunk 8 (Creates the confusion table)
## ---- confusion_table ----
confusion.tab <- table(K = sim$K, Khat)
acc <- round(sum(diag(confusion.tab))/n,3)*100
km.tmp  <- km <- kmeans(ranked.data, centers = 3)$cluster

# Rename clusters to get correct colours and
# make the confusion matrix easy to read
km.tmp[km == 1] <- 2
km.tmp[km == 2] <- 3
km.tmp[km == 3] <- 1
km <- km.tmp
confusion.tab.km <- table(K = sim$K, Khat = km)
acc.km <- round(sum(apply(confusion.tab.km, 1, max))/n, 3)*100

latex(cbind(confusion.tab, confusion.tab.km),
      cgroup = c("$\\hat{H}$ (GMCM)", "$\\hat{H}$ ($k$-means)"),
      n.cgroup = c(3,3),
      rgroup = "$H$",
      n.rgroup = 3,
      title = "",
      file = "",
      label = "confusion.mat",
      caption.loc = "bottom",
      caption = "Confusion matrices of GMCM and $k$-means clustering results.")
## ----


# Chunk 9
## ---- table_speed_res ----
# Formatting the results stored in the speed.res object
speed.res <-
  cbind(Package = toupper(gsub("-pkg (PEM|NM)", "", rownames(speed.res))),
        Algorithm = gsub("(gmcm|idr)-pkg ", "", rownames(speed.res)),
        as.data.frame(speed.res, row.names = 1:nrow(speed.res)))
speed.res <- as.data.frame(speed.res)
speed.res$elapsed <- round(speed.res$elapsed, 2)
speed.res$"Rel.\\ speed" <- round(speed.res$"Rel.\\ speed", 1)
colnames(speed.res) <-  # Some extra formatting of the table
  gsub("elapsed", "Runtime ($s$)",
       gsub("^it$", "Iterations ($n$)",
            colnames(speed.res)))
speed.res[, c(4:7, 10)] <-
  round(speed.res[, c(4:7, 10)],3)
latex(speed.res[,-c(1,3,4:7)][,c(1,3,2,4,5)],
      title = "$p$ / Package",
      file = "",
      caption.loc = "bottom",
      caption = paste("Runtime comparisons of the \\pkg{idr} and",
                      "\\pkg{GMCM} packages with increasing number",
                      "of observations $p$. The benchmarked",
                      "optimization procedures are the pseudo EM algorithm ",
                      "(PEM) and the Nelder-Mead (NM) method.",
                      "The runtime is given in seconds.",
                      "The last column shows the relative speed per",
                      "iteration compared to the fastest procedure."),
      label = "speed.tab",
      rowname = paste("\\pkg{", gsub("IDR", "idr", speed.res$Package), "}"),
      rgroup = unique(format(speed.res$n.obs,
                             scientific = FALSE, big.mark = ",")))
## ----


# Chunk 10
## ---- table_equivalent_optima ----
tab <- matrix("$\\cdot$", 3, 4)
colnames(tab) <- c("\\alpha_1", "\\mu", "\\sigma", "\\rho")
colnames(tab) <- paste("$", colnames(tab), "$", sep = "") #"\\approx$"
rownames(tab) <- 1:3
tab[1,1] <- "$1$"
tab[2,c(1,4)] <- c("$0$", "$0$")
tab[3,-1] <- c("$0$", "$1$", "$0$")
caption <- paste("Equivalent optima in pure noise. A dot ($\\cdot$) denotes",
                 "an arbitrary value. The given values need only to be",
                 "approximate.")
latex(tab, title = "Situation",
      caption = caption,
      label = "BadEst",
      file = "",
      caption.loc = "bottom")
## ----


# Chunk 11
## ---- freshfroz_group_pct ----
tmp <- table(freshfroz.group)
freshfroz.group.pct <-
  paste0("$", prettyN(tmp), "$ $(", 100*round(tmp/sum(tmp), 3), "\\%)$")
freshfroz.group.pct2 <-
  paste0("$", prettyN(sum(tmp[2:3])), "$ $(",
         100*round(sum(tmp[2:3])/sum(tmp), 3), "\\%)$")
## ----

Try the GMCM package in your browser

Any scripts or data that you put into this service are public.

GMCM documentation built on Nov. 6, 2019, 1:08 a.m.