Nothing
#' Import MCMC samples into a ggs object than can be used by all ggs_* graphical functions.
#'
#' This function manages MCMC samples from different sources (JAGS, MCMCpack, STAN -both via rstan and via csv files-) and converts them into a data frame tibble. The resulting data frame has four columns (Iteration, Chain, Parameter, value) and six attributes (nChains, nParameters, nIterations, nBurnin, nThin and description). The ggs object returned is then used as the input of the ggs_* functions to actually plot the different convergence diagnostics.
#'
#' @references Fernández-i-Marín, Xavier (2016) ggmcmc: Analysis of MCMC Samples and Bayesian Inference. Journal of Statistical Software, 70(9), 1-20. doi:10.18637/jss.v070.i09
#' @references Gelman, Carlin, Stern, Dunson, Vehtari and Rubin (2014) Bayesian Data Analysis. 3rd edition. Chapman & Hall/CRC, Boca Raton.
#' @param S Either a \code{mcmc.list} object with samples from JAGS, a \code{mcmc} object with samples from MCMCpack, a \code{stanreg} object with samples from rstanarm, a \code{brmsfit} object with samples from brms, a \code{stanfit} object with samples from rstan, or a list with the filenames of \code{csv} files generated by stan outside rstan (where the order of the files is assumed to be the order of the chains). ggmcmc guesses what is the original object and tries to import it accordingly. rstan is not expected to be in CRAN soon, and so coda::mcmc is used to extract stan samples instead of the more canonical rstan::extract.
#' @param family Name of the family of parameters to process, as given by a character vector or a regular expression. A family of parameters is considered to be any group of parameters with the same name but different numerical value between square brackets (as beta[1], beta[2], etc).
#' @param description Character vector giving a short descriptive text that identifies the model.
#' @param burnin Logical or numerical value. When logical and TRUE (the default), the number of samples in the burnin period will be taken into account, if it can be guessed by the extracting process. Otherwise, iterations will start counting from 1. If a numerical vector is given, the user then supplies the length of the burnin period.
#' @param par_labels data frame with two colums. One named "Parameter" with the same names of the parameters of the model. Another named "Label" with the label of the parameter. When missing, the names passed to the model are used for representation. When there is no correspondence between a Parameter and a Label, the original name of the parameter is used. The order of the levels of the original Parameter does not change.
#' @param sort Logical. When TRUE (the default), parameters are sorted first by family name and then by numerical value.
#' @param keep_original_order Logical. When TRUE, parameters are sorted using the original order provided by the source software. Defaults to FALSE.
#' @param splitting Logical. When TRUE, use the approach suggested by Gelman, Carlin, Stern, Dunson, Vehtari and Rubin (2014) Bayesian Data Analysis. 3rd edition. This implies splitting the sequences (original chains) in half, and treat each half as a different Chain, therefore effectively doubling the number of chains. In this case, the first half of Chain 1 is still Chain 1 , but the second half is turned into Chain 2, and the first half of Chain 2 into Chain 3, and so on. Defaults to FALSE.
#' @param inc_warmup Logical. When dealing with stanfit objects from rstan, logical value whether the warmup samples are included. Defaults to FALSE.
#' @param stan_include_auxiliar Logical value to include "lp__" parameter in rstan, and "lp__", "treedepth__" and "stepsize__" in stan running without rstan. Defaults to FALSE.
#' @export
#' @return D A data frame tibble with the data arranged and ready to be used by the rest of the \code{ggmcmc} functions. The data frame has four columns, namely: Iteration, Chain, Parameter and value, and six attributes: nChains, nParameters, nIterations, nBurnin, nThin and description. A data frame tibble is a wrapper to a local data frame, behaves like a data frame and its advantage is related to printing, which is compact. For more details, see \code{as_tibble()} in package \code{dplyr}.
#' @examples
#' # Assign 'S' to be a data frame suitable for \code{ggmcmc} functions from
#' # a coda object called s
#' data(linear)
#' S <- ggs(s) # s is a coda object
#'
#' # Get samples from 'beta' parameters only
#' S <- ggs(s, family = "beta")
ggs <- function(S, family = NA, description = NA, burnin = TRUE, par_labels = NA,
sort = TRUE, keep_original_order = FALSE, splitting = FALSE,
inc_warmup = FALSE, stan_include_auxiliar = FALSE) {
processed <- FALSE # set by default that there has not been any processed samples
#
# Manage stanfit obcjets
# Manage stan output first because it is firstly converted into an mcmc.list
#
original.object.class <- class(S)[1]
if (length(class(S)) > 1 & class(S)[1] == "stanreg") {
S <- S$stanfit
# From now on, the original object with multiple classes only has one class
}
if (class(S)== "brmsfit") {
S <- S$fit
}
if (class(S)=="stanfit") {
# Extract chain by chain
nChains <- S@sim$chains
nThin <- S@sim$thin
mDescription <- S@model_name
D <- NULL
for (l in 1:nChains) {
sdf <- as.data.frame(S@sim$samples[[l]])
names(sdf) <- names(S@sim$samples[[l]])
sdf$Iteration <- 1:dim(sdf)[1]
s <- tidyr::gather(sdf, Parameter, value, -Iteration) %>%
dplyr::mutate(Chain = l) %>%
dplyr::select(Iteration, Chain, Parameter, value)
D <- dplyr::bind_rows(D, s)
}
if (!inc_warmup) {
if (original.object.class == "stanfit") {
D <- dplyr::filter(D, Iteration > (S@sim$warmup / nThin))
D$Iteration <- D$Iteration - (S@sim$warmup / nThin)
}
nBurnin <- S@sim$warmup
} else {
nBurnin <- 0
}
# Exclude, by default, lp parameter
if (!stan_include_auxiliar) {
D <- dplyr::filter(D, Parameter!="lp__") # delete lp__
}
if (sort) {
D$Parameter <- factor(D$Parameter, levels=custom.sort(D$Parameter))
} else {
D$Parameter <- factor(D$Parameter)
}
processed <- TRUE
D <- dplyr::as_tibble(D)
}
#
# Manage csv files than contain stan samples
# Also converted first to an mcmc.list
#
if (class(S)=="list") {
D <- NULL
for (i in 1:length(S)) {
samples.c <- dplyr::as_tibble(read.table(S[[i]], sep=",", header=TRUE,
colClasses="numeric", check.names=FALSE))
D <- dplyr::bind_rows(D,
tidyr::gather(samples.c, Parameter) %>%
dplyr::mutate(Iteration=rep(1:(dim(samples.c)[1]), dim(samples.c)[2]), Chain=i) %>%
dplyr::select(Iteration, Chain, Parameter, value))
}
# Exclude, by default, lp parameter and other auxiliar ones
if (!stan_include_auxiliar) {
D <- D[grep("__$", D$Parameter, invert=TRUE),]
if (sort) {
D$Parameter <- factor(D$Parameter, levels=custom.sort(D$Parameter))
} else {
D$Parameter <- factor(D$Parameter)
}
}
nBurnin <- as.integer(gsub("warmup=", "", scan(S[[i]], "", skip=12, nlines=1, quiet=TRUE)[2]))
nThin <- as.integer(gsub("thin=", "", scan(S[[i]], "", skip=13, nlines=1, quiet=TRUE)[2]))
processed <- TRUE
}
#
# Manage mcmc.list and mcmc objects
#
if (class(S)=="mcmc.list" | class(S)=="mcmc" | processed) { # JAGS typical output or MCMCpack (or previously processed stan samples)
if (!is.na(family)) {
requireNamespace("coda")
if (!processed) {
location.family <- grep(family, dimnames(S[[1]])[[2]])
S <- S[,location.family, drop = FALSE]
} else { # for already processed samples, use dplyr()'s own filter based on grepl
D <- D %>%
filter(grepl(family, Parameter)) %>%
mutate(Parameter = as.factor(as.character(Parameter)))
}
}
if (!processed) { # only in JAGS or MCMCpack, using coda
lS <- length(S)
D <- NULL
if (lS == 1 | class(S)=="mcmc") { # Single chain or MCMCpack
if (lS == 1 & class(S)=="mcmc.list") { # single chain
s <- S[[1]]
} else { # MCMCpack
s <- S
}
# Keep a record of original names
if (keep_original_order) {
parameter.names.original.order <- dimnames(s)[[2]]
}
# Process a single chain
D <- dplyr::mutate(ggs_chain(s), Chain=1) %>%
dplyr::select(Iteration, Chain, Parameter, value)
# Get information from mcpar (burnin period, thinning)
nBurnin <- (attributes(s)$mcpar[1])-(1*attributes(s)$mcpar[3])
nThin <- attributes(s)$mcpar[3]
} else {
# Keep a record of original names
if (keep_original_order) {
parameter.names.original.order <- dimnames(S[[1]])[[2]]
}
# Process multiple chains
for (l in 1:lS) {
s <- S[l][[1]]
D <- dplyr::bind_rows(D, dplyr::mutate(ggs_chain(s), Chain=l))
}
D <- dplyr::select(D, Iteration, Chain, Parameter, value)
# Get information from mcpar (burnin period, thinning). Taking the last
# chain is fine. All chains are assumed to have the same structure.
nBurnin <- (attributes(s)$mcpar[1])-(1*attributes(s)$mcpar[3])
nThin <- attributes(s)$mcpar[3]
}
if (sort) {
D$Parameter <- factor(D$Parameter, levels=custom.sort(D$Parameter))
} else {
D$Parameter <- factor(D$Parameter)
}
if (keep_original_order) {
D$Parameter <- factor(D$Parameter, levels = parameter.names.original.order)
}
D <- dplyr::arrange(D, Parameter, Chain, Iteration)
}
# Set several attributes to the object, to avoid computations afterwards
# Number of chains
attr(D, "nChains") <- length(unique(D$Chain))
# Number of parameters
attr(D, "nParameters") <- length(unique(D$Parameter))
# Number of Iterations really present in the sample
attr(D, "nIterations") <- max(D$Iteration)
# Number of burning periods previously
if (is.numeric(burnin) & length(burnin)==1) {
attr(D, "nBurnin") <- burnin
} else if (is.logical(burnin)) {
if (burnin) {
attr(D, "nBurnin") <- nBurnin
} else {
attr(D, "nBurnin") <- 0
}
} else {
stop("burnin must be either logical (TRUE/FALSE) or a numerical vector of length one.")
}
# Thinning interval
attr(D, "nThin") <- nThin
# Descriptive text
if (is.character(description)) { # if the description is given, us it when it is a character string
attr(D, "description") <- description
} else {
if (!is.na(description)) { # if it is not a character string and not NA, show an informative message
message("description is not a text string. The name of the imported object is used instead.")
}
if (exists("mDescription")) { # In case of stan model names
attr(D, "description") <- mDescription
} else {
attr(D, "description") <- as.character(sys.call()[2]) # use the name of the source object
}
}
# Manage subsetting a family of parameters
# In order to save memory, the exclusion of parameters would be done ideally
# at the beginning of the processing, but then it has to be done for all
# input types.
if (!is.na(family)) {
D <- get_family(D, family=family)
}
# Change the names of the parameters if par_labels argument has been passed
if (length(which(class(par_labels) %in% c("data.frame", "tbl_df"))) >= 1) {
if (length(which(c("Parameter", "Label") %in% names(par_labels))) == 2) {
aD <- attributes(D)
levels(D$Parameter)[which(levels(D$Parameter) %in% par_labels$Parameter)] <-
as.character(par_labels$Label[
match(levels(D$Parameter)[which(levels(D$Parameter) %in% par_labels$Parameter)], par_labels$Parameter)])
L <- dplyr::as_tibble(data.frame(Parameter = par_labels$Label, ParameterOriginal = par_labels$Parameter)) %>%
mutate(Parameter = as.character(Parameter)) # dplyr chrashes as of 160428 development version if Parameter is factor
D <- suppressWarnings(dplyr::left_join(D, L, by = "Parameter"))
D <- D %>%
dplyr::select(Iteration, Chain, Parameter, value, ParameterOriginal)
if (class(D$Parameter) == "character") {
if (sort) {
D$Parameter <- factor(D$Parameter, levels=custom.sort(D$Parameter))
} else {
D$Parameter <- factor(D$Parameter)
}
}
# Unfortunately, the attributes are not inherited in left_join(), so they have to be manually passed again
attr(D, "nChains") <- aD$nChains
attr(D, "nParameters") <- aD$nParameters
attr(D, "nIterations") <- aD$nIterations
attr(D, "nBurnin") <- aD$nBurnin
attr(D, "nThin") <- aD$nThin
attr(D, "description") <- aD$description
# Keep the rest of the variables passed if the data frame has more than Parameter and Label
if (dim(par_labels)[2] > 2) {
aD <- attributes(D)
L.noParameter <- dplyr::as_tibble(par_labels) %>%
dplyr::select(-Parameter) %>%
dplyr::mutate(Label = as.character(Label))
D <- suppressWarnings(dplyr::left_join(D, L.noParameter, by=c("Parameter" = "Label")))
if (class(D$Parameter) == "character") {
if (sort) {
D$Parameter <- factor(D$Parameter, levels=custom.sort(D$Parameter))
} else {
D$Parameter <- factor(D$Parameter)
}
}
}
# Unfortunately, the attributes are not inherited in left_join(), so they have to be manually passed again (for second time).
attr(D, "nChains") <- aD$nChains
attr(D, "nParameters") <- aD$nParameters
attr(D, "nIterations") <- aD$nIterations
attr(D, "nBurnin") <- aD$nBurnin
attr(D, "nThin") <- aD$nThin
attr(D, "description") <- aD$description
} else {
stop("par_labels must include at least columns called 'Parameter' and 'Label'.")
}
} else {
if (!is.na(par_labels)) {
stop("par_labels must be a data frame or a tibble.")
}
}
# Perform the splitting into chain halves
if (splitting) {
original.attributes <- attributes(D)
# If the original Chain is odd, discard the last one before proceeding
if ((original.attributes$nIterations %% 2) != 0) {
D <- dplyr::filter(D, Iteration < max(Iteration))
original.attributes$nIterations <- original.attributes$nIterations - 1
}
D <- D %>%
dplyr::mutate(second.half = ifelse(Iteration > (original.attributes$nIterations / 2), 1, 0)) %>%
dplyr::mutate(Chain = as.integer( (((Chain - 1) * 2) + second.half) + 1)) %>%
dplyr::mutate(Iteration = as.integer(ifelse(second.half == 1, Iteration - (original.attributes$nIterations / 2), Iteration))) %>%
dplyr::select(Parameter, Chain, Iteration, value)
# Yet again pass the attributes
attr(D, "nChains") <- original.attributes$nChains * 2
attr(D, "nParameters") <- original.attributes$nParameters
attr(D, "nIterations") <- original.attributes$nIterations / 2
attr(D, "nBurnin") <- original.attributes$nBurnin
attr(D, "nThin") <- original.attributes$nThin
attr(D, "description") <- original.attributes$description
}
# Check empty attributes that make sense to impute
if (is.null(attributes(D)$nBurnin)) attr(D, "nBurnin") <- 0
if (is.null(attributes(D)$nThin)) attr(D, "nThin") <- 1
# Once everything is ready, return the processed object
return(D)
} else {
stop("ggs is not able to transform the input object into a ggs object suitable for ggmcmc.")
}
}
#' Auxiliary function that extracts information from a single chain.
#'
#' @param s a single chain to convert into a data frame
#' @return D data frame with the chain arranged
ggs_chain <- function(s) {
# Get the number of samples and the vector of iterations
n.samples <- dim(s)[1]
iter <- 1:n.samples
# Prepare the dataframe
d <- data.frame(Iteration=iter, as.matrix(unclass(s)), check.names=FALSE)
D <- d %>%
tidyr::gather(Parameter, value, -Iteration)
# Return the modified data frame as a tibble to be used by dplyr
D <- dplyr::as_tibble(D)
return(D)
}
#' Auxiliary function that sorts Parameter names taking into account numeric values
#'
#' @param x a character vector to which we want to sort elements
#' @return X a character vector sorted with family parametrs first and then numeric values
custom.sort <- function(x) {
x <- sort(as.character(unique(x)))
family <- gsub("\\[.+\\]", "", x)
Families <- sort(unique(family))
X <- NULL
for (f in Families) {
x.family <- x[family == f]
if (length(grep("\\[", x.family)) > 0) {
index <- gsub("]", "", gsub("(.+)\\[", "", x.family))
if (length(grep(",", index) > 0)) { # multiple dimensional object
idl <- data.frame(x.family = x.family, index = index,
matrix(unlist(strsplit(index, ",")), nrow = length(index), byrow = TRUE))
for (c in 3:(dim(idl)[2])) { # convert Xs' into numeric values
idl[,c] <- as.numeric(as.character(idl[,c]))
}
command <- paste("dplyr::arrange(idl,", paste(names(idl)[-(c(1, 2))], collapse=","), ")", sep="")
idl <- eval(parse(text = command))
x.family <- as.character(idl$x.family)
} else {
x.family <- x.family[order(as.numeric((gsub("]", "", gsub("(.+)\\[", "", x.family)))))]
}
X <- c(X, x.family)
} else {
X <- c(X, x.family)
}
}
return(X)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.