R/ggs.R
In ggmcmc: Tools for Analyzing MCMC Simulations from Bayesian Inference

Documented in custom.sort ggs ggs_chain

#' Import MCMC samples into a ggs object than can be used by all ggs_* graphical functions.
#'
#' This function manages MCMC samples from different sources (JAGS, MCMCpack, STAN -both via rstan and via csv files-) and converts them into a data frame tibble. The resulting data frame has four columns (Iteration, Chain, Parameter, value) and six attributes (nChains, nParameters, nIterations, nBurnin, nThin and description). The ggs object returned is then used as the input of the ggs_* functions to actually plot the different convergence diagnostics.
#'
#' @references Fernández-i-Marín, Xavier (2016) ggmcmc: Analysis of MCMC Samples and Bayesian Inference. Journal of Statistical Software, 70(9), 1-20. doi:10.18637/jss.v070.i09
#' @references Gelman, Carlin, Stern, Dunson, Vehtari and Rubin (2014) Bayesian Data Analysis. 3rd edition. Chapman & Hall/CRC, Boca Raton.
#' @param S Either a \code{mcmc.list} object with samples from JAGS, a \code{mcmc} object with samples from MCMCpack, a \code{stanreg} object with samples from rstanarm, a \code{brmsfit} object with samples from brms, a \code{stanfit} object with samples from rstan, or a list with the filenames of \code{csv} files generated by stan outside rstan (where the order of the files is assumed to be the order of the chains). ggmcmc guesses what is the original object and tries to import it accordingly. rstan is not expected to be in CRAN soon, and so coda::mcmc is used to extract stan samples instead of the more canonical rstan::extract.
#' @param family Name of the family of parameters to process, as given by a character vector or a regular expression. A family of parameters is considered to be any group of parameters with the same name but different numerical value between square brackets (as beta[1], beta[2], etc).
#' @param description Character vector giving a short descriptive text that identifies the model.
#' @param burnin Logical or numerical value. When logical and TRUE (the default), the number of samples in the burnin period will be taken into account, if it can be guessed by the extracting process. Otherwise, iterations will start counting from 1. If a numerical vector is given, the user then supplies the length of the burnin period.
#' @param par_labels data frame with two colums. One named "Parameter" with the same names of the parameters of the model. Another named "Label" with the label of the parameter. When missing, the names passed to the model are used for representation. When there is no correspondence between a Parameter and a Label, the original name of the parameter is used. The order of the levels of the original Parameter does not change.
#' @param sort Logical. When TRUE (the default), parameters are sorted first by family name and then by numerical value.
#' @param keep_original_order Logical. When TRUE, parameters are sorted using the original order provided by the source software. Defaults to FALSE.
#' @param splitting Logical. When TRUE, use the approach suggested by Gelman, Carlin, Stern, Dunson, Vehtari and Rubin (2014) Bayesian Data Analysis. 3rd edition. This implies splitting the sequences (original chains) in half, and treat each half as a different Chain, therefore effectively doubling the number of chains. In this case, the first half of Chain 1 is still Chain 1 , but the second half is turned into Chain 2, and the first half of Chain 2 into Chain 3, and so on. Defaults to FALSE.
#' @param inc_warmup Logical. When dealing with stanfit objects from rstan, logical value whether the warmup samples are included. Defaults to FALSE.
#' @param stan_include_auxiliar Logical value to include "lp__" parameter in rstan, and "lp__", "treedepth__" and "stepsize__" in stan running without rstan. Defaults to FALSE.
#' @export
#' @return D A data frame tibble with the data arranged and ready to be used by the rest of the \code{ggmcmc} functions. The data frame has four columns, namely: Iteration, Chain, Parameter and value, and six attributes: nChains, nParameters, nIterations, nBurnin, nThin and description. A data frame tibble is a wrapper to a local data frame, behaves like a data frame and its advantage is related to printing, which is compact. For more details, see \code{as_tibble()} in package \code{dplyr}.
#' @examples
#' # Assign 'S' to be a data frame suitable for \code{ggmcmc} functions from
#' # a coda object called s
#' data(linear)
#' S <- ggs(s)        # s is a coda object
#'
#' # Get samples from 'beta' parameters only
#' S <- ggs(s, family = "beta")
ggs <- function(S, family = NA, description = NA, burnin = TRUE, par_labels = NA,
                sort = TRUE, keep_original_order = FALSE, splitting = FALSE,
                inc_warmup = FALSE, stan_include_auxiliar = FALSE) {
  processed <- FALSE # set by default that there has not been any processed samples
  #
  # Manage stanfit obcjets
  # Manage stan output first because it is firstly converted into an mcmc.list
  #
  original.object.class <- class(S)[1]
  if (length(class(S)) > 1 & class(S)[1] == "stanreg") {
    S <- S$stanfit
    # From now on, the original object with multiple classes only has one class
  }
  if (class(S)== "brmsfit") {
    S <- S$fit
  }
  if (class(S)=="stanfit") {
    # Extract chain by chain
    nChains <- S@sim$chains
    nThin <- S@sim$thin
    mDescription <- S@model_name
    D <- NULL
    for (l in 1:nChains) {
      sdf <- as.data.frame(S@sim$samples[[l]])
      names(sdf) <- names(S@sim$samples[[l]])
      sdf$Iteration <- 1:dim(sdf)[1]
      s <- tidyr::gather(sdf, Parameter, value, -Iteration) %>%
        dplyr::mutate(Chain = l) %>%
        dplyr::select(Iteration, Chain, Parameter, value)
      D <- dplyr::bind_rows(D, s)
    }
    if (!inc_warmup) {
      if (original.object.class == "stanfit") {
        D <- dplyr::filter(D, Iteration > (S@sim$warmup / nThin))
        D$Iteration <- D$Iteration - (S@sim$warmup / nThin)
      }
      nBurnin <- S@sim$warmup
    } else {
      nBurnin <- 0
    }
    # Exclude, by default, lp parameter
    if (!stan_include_auxiliar) {
      D <- dplyr::filter(D, Parameter!="lp__") # delete lp__
    }
    if (sort) {
      D$Parameter <- factor(D$Parameter, levels=custom.sort(D$Parameter))
    } else {
      D$Parameter <- factor(D$Parameter)
    }
    processed <- TRUE
    D <- dplyr::as_tibble(D)
  }
  #
  # Manage csv files than contain stan samples
  # Also converted first to an mcmc.list
  #
  if (class(S)=="list") {
    D <- NULL
    for (i in 1:length(S)) {
      samples.c <- dplyr::as_tibble(read.table(S[[i]], sep=",", header=TRUE,
        colClasses="numeric", check.names=FALSE))
      D <- dplyr::bind_rows(D,
        tidyr::gather(samples.c, Parameter) %>%
        dplyr::mutate(Iteration=rep(1:(dim(samples.c)[1]), dim(samples.c)[2]), Chain=i) %>%
        dplyr::select(Iteration, Chain, Parameter, value))
    }
    # Exclude, by default, lp parameter and other auxiliar ones
    if (!stan_include_auxiliar) {
      D <- D[grep("__$", D$Parameter, invert=TRUE),]
      if (sort) {
        D$Parameter <- factor(D$Parameter, levels=custom.sort(D$Parameter))
      } else {
        D$Parameter <- factor(D$Parameter)
      }
    }
    nBurnin <- as.integer(gsub("warmup=", "", scan(S[[i]], "", skip=12, nlines=1, quiet=TRUE)[2]))
    nThin <- as.integer(gsub("thin=", "", scan(S[[i]], "", skip=13, nlines=1, quiet=TRUE)[2]))
    processed <- TRUE
  }
  #
  # Manage mcmc.list and mcmc objects
  #
  if (class(S)=="mcmc.list" | class(S)=="mcmc" | processed) {  # JAGS typical output or MCMCpack (or previously processed stan samples)
    if (!is.na(family)) {
      requireNamespace("coda")
      if (!processed) {
        location.family <- grep(family, dimnames(S[[1]])[[2]])
        S <- S[,location.family, drop = FALSE]
      } else { # for already processed samples, use dplyr()'s own filter based on grepl
        D <- D %>%
          filter(grepl(family, Parameter)) %>%
          mutate(Parameter = as.factor(as.character(Parameter)))
      }
    }
    if (!processed) { # only in JAGS or MCMCpack, using coda
      lS <- length(S)
      D <- NULL
      if (lS == 1 | class(S)=="mcmc") { # Single chain or MCMCpack
        if (lS == 1 & class(S)=="mcmc.list") { # single chain
          s <- S[[1]]
        } else { # MCMCpack
          s <- S
        }
        # Keep a record of original names
        if (keep_original_order) {
          parameter.names.original.order <- dimnames(s)[[2]]
        }
        # Process a single chain
        D <- dplyr::mutate(ggs_chain(s), Chain=1) %>%
          dplyr::select(Iteration, Chain, Parameter, value)
        # Get information from mcpar (burnin period, thinning)
        nBurnin <- (attributes(s)$mcpar[1])-(1*attributes(s)$mcpar[3])
        nThin <- attributes(s)$mcpar[3]
      } else {
        # Keep a record of original names
        if (keep_original_order) {
          parameter.names.original.order <- dimnames(S[[1]])[[2]]
        }
        # Process multiple chains
        for (l in 1:lS) {
          s <- S[l][[1]]
          D <- dplyr::bind_rows(D, dplyr::mutate(ggs_chain(s), Chain=l))
        }
        D <- dplyr::select(D, Iteration, Chain, Parameter, value)
        # Get information from mcpar (burnin period, thinning). Taking the last
        # chain is fine. All chains are assumed to have the same structure.
        nBurnin <- (attributes(s)$mcpar[1])-(1*attributes(s)$mcpar[3])
        nThin <- attributes(s)$mcpar[3]
      }
      if (sort) {
        D$Parameter <- factor(D$Parameter, levels=custom.sort(D$Parameter))
      } else {
        D$Parameter <- factor(D$Parameter)
      }
      if (keep_original_order) {
        D$Parameter <- factor(D$Parameter, levels = parameter.names.original.order)
      }
      D <- dplyr::arrange(D, Parameter, Chain, Iteration)
    }
    # Set several attributes to the object, to avoid computations afterwards
    # Number of chains
    attr(D, "nChains") <- length(unique(D$Chain))
    # Number of parameters
    attr(D, "nParameters") <- length(unique(D$Parameter))
    # Number of Iterations really present in the sample
    attr(D, "nIterations") <- max(D$Iteration)
    # Number of burning periods previously
    if (is.numeric(burnin) & length(burnin)==1) {
      attr(D, "nBurnin") <- burnin
    } else if (is.logical(burnin)) {
      if (burnin) {
        attr(D, "nBurnin") <- nBurnin
      } else {
        attr(D, "nBurnin") <- 0
      }
    } else {
      stop("burnin must be either logical (TRUE/FALSE) or a numerical vector of length one.")
    }
    # Thinning interval
    attr(D, "nThin") <- nThin
    # Descriptive text
    if (is.character(description)) { # if the description is given, us it when it is a character string
      attr(D, "description") <- description
    } else {
      if (!is.na(description)) { # if it is not a character string and not NA, show an informative message
        message("description is not a text string. The name of the imported object is used instead.")
      }
      if (exists("mDescription")) { # In case of stan model names
        attr(D, "description") <- mDescription
      } else {
        attr(D, "description") <- as.character(sys.call()[2]) # use the name of the source object
      }
    }
    # Manage subsetting a family of parameters
    # In order to save memory, the exclusion of parameters would be done ideally
    # at the beginning of the processing, but then it has to be done for all
    # input types.
    if (!is.na(family)) {
      D <- get_family(D, family=family)
    }
    # Change the names of the parameters if par_labels argument has been passed
    if (length(which(class(par_labels) %in% c("data.frame", "tbl_df"))) >= 1) {
      if (length(which(c("Parameter", "Label") %in% names(par_labels))) == 2) {
        aD <- attributes(D)
        levels(D$Parameter)[which(levels(D$Parameter) %in% par_labels$Parameter)] <-
          as.character(par_labels$Label[
            match(levels(D$Parameter)[which(levels(D$Parameter) %in% par_labels$Parameter)], par_labels$Parameter)])
        L <- dplyr::as_tibble(data.frame(Parameter = par_labels$Label, ParameterOriginal = par_labels$Parameter)) %>%
          mutate(Parameter = as.character(Parameter)) # dplyr chrashes as of 160428 development version if Parameter is factor
        D <- suppressWarnings(dplyr::left_join(D, L, by = "Parameter"))
        D <- D %>%
          dplyr::select(Iteration, Chain, Parameter, value, ParameterOriginal)
        if (class(D$Parameter) == "character") {
          if (sort) {
            D$Parameter <- factor(D$Parameter, levels=custom.sort(D$Parameter))
          } else {
            D$Parameter <- factor(D$Parameter)
          }
        }
        # Unfortunately, the attributes are not inherited in left_join(), so they have to be manually passed again
        attr(D, "nChains") <- aD$nChains
        attr(D, "nParameters") <- aD$nParameters
        attr(D, "nIterations") <- aD$nIterations
        attr(D, "nBurnin") <- aD$nBurnin
        attr(D, "nThin") <- aD$nThin
        attr(D, "description") <- aD$description
        # Keep the rest of the variables passed if the data frame has more than Parameter and Label
        if (dim(par_labels)[2] > 2) {
          aD <- attributes(D)
          L.noParameter <- dplyr::as_tibble(par_labels) %>%
            dplyr::select(-Parameter) %>%
            dplyr::mutate(Label = as.character(Label))
          D <- suppressWarnings(dplyr::left_join(D, L.noParameter, by=c("Parameter" = "Label")))
          if (class(D$Parameter) == "character") {
            if (sort) {
              D$Parameter <- factor(D$Parameter, levels=custom.sort(D$Parameter))
            } else {
              D$Parameter <- factor(D$Parameter)
            }
          }
        }
        # Unfortunately, the attributes are not inherited in left_join(), so they have to be manually passed again (for second time).
        attr(D, "nChains") <- aD$nChains
        attr(D, "nParameters") <- aD$nParameters
        attr(D, "nIterations") <- aD$nIterations
        attr(D, "nBurnin") <- aD$nBurnin
        attr(D, "nThin") <- aD$nThin
        attr(D, "description") <- aD$description
      } else {
        stop("par_labels must include at least columns called 'Parameter' and 'Label'.")
      }
    } else {
      if (!is.na(par_labels)) {
        stop("par_labels must be a data frame or a tibble.")
      }
    }
    # Perform the splitting into chain halves
    if (splitting) {
      original.attributes <- attributes(D)
      # If the original Chain is odd, discard the last one before proceeding
      if ((original.attributes$nIterations %% 2) != 0) {
        D <- dplyr::filter(D, Iteration < max(Iteration))
        original.attributes$nIterations <- original.attributes$nIterations - 1
      }
      D <- D %>%
        dplyr::mutate(second.half = ifelse(Iteration > (original.attributes$nIterations / 2), 1, 0)) %>%
        dplyr::mutate(Chain = as.integer( (((Chain - 1) * 2) + second.half) + 1)) %>%
        dplyr::mutate(Iteration = as.integer(ifelse(second.half == 1, Iteration - (original.attributes$nIterations / 2), Iteration))) %>%
        dplyr::select(Parameter, Chain, Iteration, value)
      # Yet again pass the attributes
      attr(D, "nChains") <- original.attributes$nChains * 2
      attr(D, "nParameters") <- original.attributes$nParameters
      attr(D, "nIterations") <- original.attributes$nIterations / 2
      attr(D, "nBurnin") <- original.attributes$nBurnin
      attr(D, "nThin") <- original.attributes$nThin
      attr(D, "description") <- original.attributes$description
    }
    # Check empty attributes that make sense to impute
    if (is.null(attributes(D)$nBurnin)) attr(D, "nBurnin") <- 0
    if (is.null(attributes(D)$nThin)) attr(D, "nThin") <- 1
    # Once everything is ready, return the processed object
    return(D)
  } else {
    stop("ggs is not able to transform the input object into a ggs object suitable for ggmcmc.")
  }
}

#' Auxiliary function that extracts information from a single chain.
#'
#' @param s a single chain to convert into a data frame
#' @return D data frame with the chain arranged
ggs_chain <- function(s) {
  # Get the number of samples and the vector of iterations
  n.samples <- dim(s)[1]
  iter <- 1:n.samples

  # Prepare the dataframe
  d <- data.frame(Iteration=iter, as.matrix(unclass(s)), check.names=FALSE)
  D <- d %>%
    tidyr::gather(Parameter, value, -Iteration)

  # Return the modified data frame as a tibble to be used by dplyr
  D <- dplyr::as_tibble(D)
  return(D)
}

#' Auxiliary function that sorts Parameter names taking into account numeric values
#'
#' @param x a character vector to which we want to sort elements
#' @return X a character vector sorted with family parametrs first and then numeric values
custom.sort <- function(x) {
  x <- sort(as.character(unique(x)))
  family <- gsub("\\[.+\\]", "", x)
  Families <- sort(unique(family))
  X <- NULL
  for (f in Families) {
    x.family <- x[family == f]
    if (length(grep("\\[", x.family)) > 0) {
      index <- gsub("]", "", gsub("(.+)\\[", "", x.family))
      if (length(grep(",", index) > 0)) { # multiple dimensional object
        idl <- data.frame(x.family = x.family, index = index,
          matrix(unlist(strsplit(index, ",")), nrow = length(index), byrow = TRUE))
        for (c in 3:(dim(idl)[2])) { # convert Xs' into numeric values
          idl[,c] <- as.numeric(as.character(idl[,c]))
        }
        command <- paste("dplyr::arrange(idl,", paste(names(idl)[-(c(1, 2))], collapse=","), ")", sep="")
        idl <- eval(parse(text = command))
        x.family <- as.character(idl$x.family)
      } else {
        x.family <- x.family[order(as.numeric((gsub("]", "", gsub("(.+)\\[", "", x.family)))))]
      }
      X <- c(X, x.family)
    } else {
      X <- c(X, x.family)
    }
  }
  return(X)
}