#------------------------------------------------------------------------------#
# Package info
#------------------------------------------------------------------------------#
#' mtraceR: an R package to analyse migrate-n output
#'
#' \code{mtraceR} can be used to visualise trace of MCMC chains, calculate
#' Effective Sample Size (ESS), Gelman and Rubin's convergence diagnostic (and
#' plots) or Bayes factor, and generate skyline plots.
#'
#' The main function \code{mtrace} is essentially a wrapper for the R package
#' \code{coda} optimised for migrate-n outputs. It is intended to facilitate and
#' speed up post-analysis data processing.
#'
#' COMPLETE DESCRIPTION/DETAILS
#'
#' @section Documentations:
#' Use \code{help(package = "mtraceR")} for a list of \code{mtraceR} functions
#' and their specific documentations.
#'
#' A more detailed description of the package and functions can be opened with:
#' \code{vignette(package="mtraceR", topic="mtraceR-package")}.
#'
#' More vignettes may be come available in the future. Use
#' \code{vignette(package="mtraceR")} to see all the available vignettes.
#'
#' @section Citation:
#' If you use \code{mtraceR}, please cite:
#' ...
#'
#'
#' @section Get in touch:
#' Please, use \url{https://github.org/carlopacioni/mtraceR/issues} to report
#' any issues with \code{mtraceR}. If unsure, or for feedback, contact us at:
#' carlo.pacioni 'at' gmail.com or anders???.
#'
#' @docType package
#'
#' @name mtraceR
NULL
#------------------------------------------------------------------------------#
# Data
#------------------------------------------------------------------------------#
#' @name mtDNA_4popsSt
#' @title Example of \code{mtrace} output
#' @description \code{mtrace} output from a run (with no replicate) using sequence
#' data (mtDNA) with 4 sampling sites connected by a stepping stone migration
#' model between either sites 1 or 2 and 3 and 4
#' (i.e. migration model: {* * 0 * * * 0 * 0 0 * * * * * *}).
#' @usage data(mtDNA_4popsSt)
#' @format a \code{list} with 3 named elelments.
#' @source Pacioni et al. In prep.
NULL
#' @name ms_3popsSt
#' @title Example of \code{mtrace} output
#' @description \code{mtrace} output from a run (with 7 replicates) using
#' microsatellite data with 4 sampling sites grouped in three populations
#' connected by a stepping stone migration model.
#' @usage data(ms_3popsSt)
#' @format a \code{list} with 3 named elelments.
#' @source Pacioni et al. In prep.
NULL
#' Diagnostics of migrate-n MCMC chains
#'
#' \code{mtrace} parses migrate-n output file (bayesallfile) and calculates the
#' Effective Sample Size (ESS) for each locus. If \code{save2dis=TRUE} these
#' are also saved to disk together with a PDF with the trace and density plots
#' for each locus and each replicate.
#'
#' If there is more than 1 replicate, \code{mtrace} returns also Gelman and
#' Rubin's convergence diagnostic for each locus. When \code{save2dis=TRUE}
#' relevant diagnostic plots are also saved as a separate PDF.
#'
#' When no bayesallfile name and directory are provided (default), the output
#' file is selected with an interactive window. Alternatively, when the directory
#' path is provided with \code{dir.in} and \code{bayesallfile=NULL}, the latter
#' is automatically set to "bayesallfile.gz". If the name of the file is provided,
#' but \code{dir.in=NULL}, the directory path is selected interactively.
#'
#' If the analysed was interrupted, or recovered, it is possible that \code{mtrace}
#' will throw an error. In such cases, it is possible to attempt repairing the file
#' by setting \code{fixcorruptBayesallfile=TRUE}. See further information on
#' \code{\link{cleanBayesAll}}.
#'
#' If \code{dir.out=NULL} (default) then \code{dir.out=dir.in}.
#'
#' @param heating Whether heating is used: (\code{TRUE} or \code{FALSE})
#' @param nchain Number of chains if \code{heating=TRUE}
#' @param burn.in Length of burn-in as proportion of MCMC (default: 0.1)
#' @param trim_params Whether trim data to estimated parameters (i.e. theta and M.
#' default: TRUE)
#' @param thin the thinning interval of recorded steps of trace plots
#' @param bayesallfile The name of the bayesallfile (default: NULL)
#' @param fixcorruptBayesallfile Whether the user believe the bayesallfile is corrupted
#' and wants to attempt to fix it. Default FALSE for back compatibility.
#' @param dir.in The local folder containing migrate-n output files (default: NULL)
#' @param dir.out The local path to store the results. If NULL (default) then
#' \code{dir.out=dir.in}
#' @param save2disk Whether to save results to disk (default: TRUE)
#' @return a list with the ESS for each locus, a summary of ESS across all loci
#' and Gelman's diagnostic if more replicates were run. See details for
#' additional outputs when \code{save2dis=TRUE}.
#' @import data.table
#' @import coda
#' @export
mtrace <- function(heating=TRUE, nchain=4, burn.in=0.1, trim_params=TRUE,
thin=10, bayesallfile=NULL, fixcorruptBayesallfile=FALSE,
dir.in=NULL,
dir.out=NULL, save2disk=TRUE) {
#----------------------------------------------------------------------------#
# Helper funstions
#----------------------------------------------------------------------------#
# Split data in bayesallfile by replicates
# x Element of a list passed with lapply()
byRepl <- function(x) {
split(x, x$Replicate)
}
# Remove first two columns (Steps & Locus)
# x Element of a list passed with lapply()
noLocus <- function(x) {
ncol <- dim(x[[1]])[2]
noloc <- lapply(x, function(slim) slim[, 3:ncol])
return(noloc)
}
# Convert object in mcmc
# x Element of a list passed with lapply()
make.mcmc <- function(x) {
x.mcmc <- lapply(x, mcmc)
return(x.mcmc)
}
# Remove burn-in
# x Element of a list passed with lapply()
# burn burn-in. This is calculate automatically based on burn.in
rm.burn <- function(x, burn) {
x.burn <- lapply(x, window, start=round(burn))
return(x.burn)
}
# Trim down data by removing unnecessary columns
# x Element of a list passed with lapply()
trim.data <- function(x, last.col) {
trimmed <- lapply(x, function(tr) tr[, 7:(last.col - 3)])
trimmed <- mcmc.list(trimmed)
return(trimmed)
}
# Gelman and Rubin's convergence diagnostic
# x Element of a list passed with lapply()
diagnostic <- function(x) {
x.diagn <- lapply(x, gelman.diag, autoburnin=FALSE)
return(x.diagn)
}
# Save to disk Gelman and Rubin's convergence diagnostic
# x A list where each element is the diagnostic for a locus
# i A sequence integer vector of length equal to the number of loci
save.gel.diag <- function(i, x) {
Locus <- rep(i, dim(x[[i]]$psrf)[1])
Parameter <- rownames(x[[i]]$psrf)
x.locus <- as.data.frame(x[[i]]$psrf)
x.locus <- cbind(Parameter, Locus, x.locus)
return(x.locus)
}
# Trace and density plots with locus name
# x A list where each element is a mcmc.list with replicates for one
# locus
# i A sequence integer vector of length equal to the number of loci
plot.with.name <- function(i, x) {
plot(x[[i]], ylab=paste("Locus", names(x)[i]), ask=FALSE)
}
# Plot of Gelman and Rubin's convergence diagnostic
# x A list where each element is a mcmc.list with replicates for one
# locus
# i A sequence integer vector of length equal to the number of loci
gelman.plot.with.name <- function(i, x) {
gelman.plot(x[[i]], ylab=paste("Locus", names(x)[i]), autoburnin=FALSE)
}
###---------------------------- Data handling -----------------------------###
# data.table 1.9.6 should work,
# data.table 1.9.5 though has an issues with the end of line
# so, for now, stick to read.table
if(is.null(bayesallfile) & is.null(dir.in)) {
message("Please, select the bayesallfile to import")
full.path <- file.choose()
bayesallfile <- basename(full.path)
dir.in <- dirname(full.path)
} else {
if(is.null(bayesallfile)) bayesallfile <- "bayesallfile.gz"
if(is.null(dir.in))
dir.in <- choose.dir("Select the folder where migate-n output files are")
}
if(is.null(dir.out)) dir.out <- dir.in
# attempt to fix bayesallfile if the user indicates it is corrupted
if(fixcorruptBayesallfile) cleanBayesAll(dir.in, bayesallfile)
message(paste("Parsing", bayesallfile))
rl <- readLines(paste0(dir.in, "/", bayesallfile), n=100)
patt <- grep(pattern = "^# @@@@@@@@", rl)
h <- make.names(read.table(paste0(dir.in, "/", bayesallfile),
header=F, skip=patt, nrow=1, colClasses="character"))
ncols <- dim(read.table(paste0(dir.in, "/", bayesallfile), header=FALSE,
nrows=10, skip=patt + 1, colClasses="numeric",
comment.char=""))[2]
col.c <- c(rep("numeric", length(h)), rep("NULL", ncols - length(h)))
col.n <- c(h, rep("NULL", ncols - length(h)))
data <- read.table(paste0(dir.in, "/", bayesallfile), header=FALSE,
skip=patt + 1, colClasses=col.c, comment.char="",
col.names=col.n)
parlast <- length(h) - 2 - if (heating == TRUE) nchain else 0
repl <- length(unique(data$Replicate))
if(repl > 1) {
message(paste("Detected", repl, "replicates"))
} else {
message("No replicates detected")
}
message("Done!")
if(save2disk) save(data,
file = file.path(dir.out,
paste0(sub(x = bayesallfile, pattern="\\.gz$",
replacement = ""),
".rda")))
message("Data processing started")
l.locus <- split(data[, 2:parlast], data$Locus)
l.locus.repl <- lapply(l.locus, byRepl)
burn <- burn.in * dim(l.locus.repl[[1]][[1]])[1]
l.locus.repl <- lapply(l.locus.repl, noLocus)
l.locus.mrepl <- lapply(l.locus.repl, make.mcmc)
l.locus.mreplno.burn <- lapply(l.locus.mrepl, rm.burn, burn=burn)
l.mlLocus.mrepl <- lapply(l.locus.mreplno.burn, mcmc.list)
if(repl > 1 | trim_params == TRUE) {
l.mlLocus.mrepltr <- lapply(l.mlLocus.mrepl, trim.data, last.col=parlast)
}
message("Done!")
###-------------------------- Gelman diagnostic ---------------------------###
if(repl > 1) {
message("Calculating Gelman's diagnostic")
g.diag <- lapply(l.mlLocus.mrepl, gelman.diag, autoburnin=FALSE,
multivariate = F)
message("Done!")
}
###-------------------------------- ESS -----------------------------------###
message("Calculating ESS after removal of burn-in...")
ESSlist <- lapply(l.mlLocus.mrepl, effectiveSize)
ESS <- as.data.frame(ESSlist)
names(ESS) <- paste("Locus", 1:length(ESSlist))
EES.summary <- apply(ESS, 1, summary)
message("Done!")
###---------------------------- save2disk ---------------------------------###
if(save2disk == TRUE) {
message("Saving reults to disk...")
write.csv(ESS, file=paste0(dir.out, "/", "ESS.csv"))
write.csv(EES.summary, file=paste0(dir.out, "/", "EES.summary.csv" ))
i <- seq_along(l.mlLocus.mrepl)
pdf(file=paste0(dir.out, "/", "TracePlots_burninRemoved.pdf"))
if(trim_params == TRUE) {
if(thin > 1) {
after.thin <- lapply(l.mlLocus.mrepltr, window, thin=thin)
lapply(i, plot.with.name, x=after.thin)
} else {
lapply(i, plot.with.name, x=l.mlLocus.mrepltr)
}
} else {
if(thin > 1) {
after.thin <- lapply(l.mlLocus.mrepl, window, thin=thin)
lapply(i, plot.with.name, x=after.thin)
} else {
lapply(i, plot.with.name, x=l.mlLocus.mrepl)
}
}
dev.off()
if(repl > 1) {
gelm.res <- lapply(i, save.gel.diag, x=g.diag)
write.csv(data.table::rbindlist(gelm.res),
file=paste0(paste0(dir.out, "/", "gelm.diag.csv")))
pdf(file=paste0(dir.out, "/", "gelm.plots.pdf"))
lapply(i, gelman.plot.with.name, x=l.mlLocus.mrepltr)
dev.off()
}
message("Done!")
}
###------------------------------------------------------------------------###
return(list(ESS=ESS,
EES.summary=EES.summary,
gelm.diag=if(repl > 1) g.diag else NULL))
}
#' Calculate Bayes Factor
#'
#' \code{BF} calculates log Bayes factors (LBF) of all models compared against
#' the model that has the highest log likelihood. The models are then ranked based
#' on LBF (mod.rank) and the model probability (mod.prob) is calculated.
#'
#' A LBF value of -2 (or smaller) indicates support for reference model (the
#' model that has the highest log likelihood). Values smaller than -6 indicate a
#' strong evidence in favour of the reference model (Kass and Raftery 1995).
#'
#' @param lmLs Numeric vector with log marginal likelihood values of the models
#' @return A data.frame with log Bayes factors (LBF), the models' ranks
#' (mod.rank) and the models' probability (mod.prob).
#' @export
#' @examples
#' # From migrate-n tutotial: 'Comparison of gene flow models using
#' # Bayes Factors' (Beerli 2010)
#' mod.comp <- BF(c(-4862.85, -4860.58, -4863.08, -4887.25))
#' mod.comp
BF <- function(lmLs=NULL) {
# error handling
if(!is.numeric(lmLs)) stop("Please, pass a numeric vector of log
likelihood with the argument lmLs")
ref.mod <- max(lmLs)
LBF <- 2 * (lmLs - ref.mod)
BF <- exp(LBF )
mod.prob <- round(BF / sum(BF), digits=3)
mod.rank <- rank( - LBF)
return(data.frame(lmL=lmLs, LBF, mod.rank, mod.prob))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.