#' (Modified) TWINSPAN in R
#'
#' Calculates TWINSPAN (TWo-way INdicator SPecies ANalaysis, Hill 1979) and its modified version according to Rolecek et al. (2009)
#' @author Mark O. Hill wrote the original Fortran code, which has been compiled by Stephan M. Hennekens into \emph{twinspan.exe} to be used within his application MEGATAB (which was, in the past, part of Turboveg for Windows; Hennekens & Schaminee 2001). This version of \emph{twinspan.exe} was later used also in JUICE program (Tichy 2002) and fixed by Petr Smilauer for issues related to order instability. The \code{twinspanR} package was written by David Zeleny (zeleny.david@@gmail.com); it is basically an R wrapper around \emph{twinspan.exe} program maintaining the communication between \emph{twinspan.exe} and R, with some added functionality (e.g. implementing the algorithm of modified TWINSPAN by Rolecek et al. 2009).
#' @name twinspan
#' @param com Community data (\code{data.frame} or \code{matrix}).
#' @param modif Should the modified TWINSPAN algorithm be used? (logical, value, default = FALSE, i.e. standard TWINSPAN)
#' @param cut.levels Pseudospecies cut levels (default = \code{c(0,2,5,10,20)}). Should not exceed 9 cut levels.
#' @param min.group.size Minimum size of the group, which should not be further divided (default = 5).
#' @param levels Number of hierarchical levels of divisions (default = 6, should be between 0 and 15). Applies only for standard TWINSPAN (\code{modif = FALSE}).
#' @param clusters Number of clusters generated by modified TWINSPAN (default = 5). Applies only for modified TWINSPAN (\code{modif = TRUE}).
#' @param diss Dissimilarity (default = 'bray') used to calculate cluster heterogeneity for modified TWINSPAN (\code{modif = TRUE}). Available options are: \code{'total.inertia'} for total inertia measured by correspondence analysis; \code{'whittaker'} for Whittaker's multiplicative measure of beta diversity; \code{'bray'}, \code{'jaccard'} and some other (see Details) for pairwise measure of betadiversity; \code{'multi.jaccard'} and \code{'multi.sorensen'} for multi-site measures of beta diversity (sensu Baselga et al. 2007). Details for more information. Applies only for modified TWINSPAN (\code{modif = TRUE}).
#' @param min.diss Minimum dissimilarity under which the cluster will not be divided further (default = NULL, which means that the stopping rule is based on number of clusters (parameter \code{clusters})). Currently not implemented.
#' @param mean.median Should be the average dissimilarity of cluster calculated as mean or median of all between sample dissimilarities within the cluster? (default = \code{'mean'}, alternative is \code{'median'})
#' @param show.output.on.console Logical; should the function communicating with \code{twinspan.exe} show the output (rather long) of TWINSPAN program on console? Default = \code{FALSE}. Argument passsed via function \code{shell} into \code{system}.
#' @param quiet Logical; should the function reading TWINSPAN output files (tw.TWI and tw.PUN) be quiet and not report on console number of items it has read? Default = \code{TRUE}, means the function is quiet. Argument passed into function \code{scan}.
#' @param object Object of the class \code{'tw'}.
#' @param ... Other (rarely used) TWINSPAN parameters passed into function \code{\link{create.tw.dat}} (see the help file of this function for complete list of modifiable arguments).
#' @details The function \code{twinspan} calculates TWINSPAN classification algorithm introduced by Hill (1979) and alternatively also modified TWINSPAN algorithm introduced by Rolecek et al. (2009). It generates object of the class \code{tw}, with generic \code{print} function printing results, \code{summary} for overview of parameters and \code{cut} defining which sample should be classified into which cluster.
#'
#' Default values for arguments used in \code{twinspan} function (e.g. definition of pseudospecies cut levels, number of hierarchical divisions etc.) are the same as the default values of the original TWINSPAN program (Hill 1979) and also WinTWINS (Hill & Smilauer 2005).
#'
#' When calculating modified TWINSPAN (\code{modif = TRUE}), one needs to choose number of target clusters (argument \code{cluster}) instead of hierarchical levels of divisions as in standard TWINSPAN (argument \code{levels}), and also the measure of dissimilarity (\code{diss}) to calculate heterogeneity of clusters at each step of division (the most heterogeneous one is divided in the next step). Several groups of beta diversity measures are currently implemented:
#' \itemize{
#' \item\code{'total.inertia'} - total inertia, calculated by correspondence analysis (\code{cca} function from \code{vegan}) and applied on original quantitative species data (abundances);
#' \item\code{'whittaker'} - Whittaker's beta diversity, calculated as gamma/mean(alpha)-1 (Whittaker 1960), applied on species data transformed into presences-absences;
#' \item\code{'manhattan'}, \code{'euclidean'}, \code{'canberra'}, \code{'bray'}, \code{'kulczynski'}, \code{'jaccard'}, \code{'gower'}, \code{'altGower'}, \code{'morisita'}, \code{'horn'}, \code{'mountford'}, \code{'raup'}, \code{'binomial'}, \code{'chao'}, \code{'cao'} or \code{'mahalanobis'} - mean of beta diversities calculated among pairs of samples - argument is passed into argument \code{diss} in \code{\link{vegdist}} function of \code{vegan}, applied on original quantitative species data (abundances);
#' \item\code{'multi.sorensen'} or \code{'multi.jaccard'} - multi-site beta diversity, calculated from group of sites according to Baselga et al. (2007) and using function \code{beta.multi} from library \code{betapart}.
#' }
#'
#' If the row names in community matrix (\code{com}) contain spaces, these names will be transformed into syntactically valid names using function \code{make.names} (syntactically valid name consists of letters, numbers and the dot or underline characters and starts with a letter or the dot not followed by a number).
#'
#' Arguments \code{show.output.on.console} and \code{quiet} regulates how "verbal" will be \code{twinspan} function while running. Default setting (\code{show.output.on.console = FALSE, quiet = TRUE}) supress all the output. Setting \code{quiet = FALSE} has only minor effect - it reports how many items have been read in each step of analysis from the output files (tw.TWI and tw.PUN) using function \code{scan} (the argument \code{quiet} is directly passed into this function). In contrary setting \code{show.output.on.console = TRUE} prints complete output generated by \code{twinspan.exe} program on console. Argument \code{show.output.on.console} has similar behavior as the argument of the same name in function \code{\link{system}}, but the value is not directly passed to this function. Output could be captured using function \code{capture.output} from package \code{utils} - see examples below.
#'
#'
#' @return \code{twinspan} returns object of the class \code{'tw'}, which is a list with the following items:
#' \itemize{
#' \item \code{classif} data frame with three columns: \code{order} - sequential number of plot, \code{plot.no} - original number of plot (\code{row.names} in community matrix \code{com}), \code{class} - binomial code with hieararchical result of TWINSPAN classification.
#' \item \code{twi} vector (if \code{modif = FALSE}) or list (if \code{modif = TRUE}) of complete machine-readable output of TWINSPAN algorithm read from *.TWI file. In case of modified TWINSPAN (\code{modif = TRUE}) it is a list with number of items equals to number of clusters.
#' \item \code{spnames} data frame with two columns: \code{full.name} - full scientific species name (\code{names (com)}), \code{abbrev.name} - eight-digits abbreviation created by \code{make.cepnames} function from \code{vegan}.
#' \item \code{modif} logical; was the result calculated using standard TWINSPAN (\code{modif = FALSE}) or its modified version (\code{modif = TRUE})?
#' }
#' @references \itemize{
#' \item Baselga A., Jimenez-Valverde A. & Niccolini G. (2007): A multiple-site similarity measure independent of richness. \emph{Biology Letters}, 3: 642-645.
#' \item Hennekens S.M. & Schaminee J.H.J. (2001): TURBOVEG, a comprehensive data base management system for vegetation data. \emph{Journal of Vegetation Science}, 12: 589-591.
#' \item Hill M.O. (1979): \emph{TWINSPAN - A FORTRAN program for arranging multivariate data in an ordered two-way table by classification of the individuals and attributes}. Section of Ecology and Systematics, Cornel University, Ithaca, New York.
#' \item Hill M.O. & Smilauer P. (2005): \emph{TWINSPAN for Windows version 2.3}. Centre for Ecology and Hydrology & University of South Bohemia, Huntingdon & Ceske Budejovice.
#' \item Rolecek J., Tichy L., Zeleny D. & Chytry M. (2009): Modified TWINSPAN classification in which the hierarchy respects cluster heterogeneity. \emph{Journal of Vegetation Science}, 20: 596-602.
#' \item Tichy L. (2002): JUICE, software for vegetation classification. \emph{Journal of Vegetation Science}, 13: 451-453.
#' \item Whittaker R.H. (1960): Vegetation of the Siskiyou mountains, Oregon and California. \emph{Ecological Monographs}, 30:279-338.
#' }
#' @examples
#' ## Modified TWINSPAN on traditional Ellenberg's Danube meadow dataset, projected on DCA
#' ## and compared with original classification into three vegetation types made by tabular sorting:
#' library (twinspanR)
#' library (vegan)
#' data (danube)
#' res <- twinspan (danube$spe, modif = TRUE, clusters = 4)
#' k <- cut (res)
#' dca <- decorana (danube$spe)
#' par (mfrow = c(1,2))
#' ordiplot (dca, type = 'n', display = 'si', main = 'Modified TWINSPAN')
#' points (dca, col = k)
#' for (i in c(1,2,4)) ordihull (dca, groups = k, show.group = i, col = i,
#' draw = 'polygon', label = TRUE)
#' ordiplot (dca, type = 'n', display = 'si', main = 'Original assignment\n (Ellenberg 1954)')
#' points (dca, col = danube$env$veg.type)
#' for (i in c(1:3)) ordihull (dca, groups = danube$env$veg.type,
#' show.group = unique (danube$env$veg.type)[i], col = i,
#' draw = 'polygon', label = TRUE)
#'
#' ## To capture the console output of twinspan.exe into R object, use the following:
#' \dontrun{
#' out <- capture.output (tw <- twinspan (danube$spe, show.output.on.console = T))
#' summary (tw) # returns summary of twinspan algorithm
#' cat (out, sep = '\n') # prints the captured output
#' write.table (out, file = 'out.txt', quot = F, row.names = F) # writes output to 'out.txt' file
#' }
#' @seealso \code{\link{create.tw.dat}}, \code{\link{cut.tw}}, \code{\link{print.tw}}.
#' @importFrom riojaExtra write.CEP
#' @importFrom stats median
#' @importFrom vegan make.cepnames cca vegdist
#' @importFrom betapart beta.multi
#' @export
twinspan <- function (com, modif = F, cut.levels = c(0,2,5,10,20), min.group.size = 5, levels = 6, clusters = 5, diss = 'bray', min.diss = NULL, mean.median = 'mean', show.output.on.console = FALSE, quiet = TRUE, ...)
{
# DEFINITION OF FUNCTIONS
cluster.heter.fun <- function (com, tw.class.level, diss, mean.median)
unlist (lapply (sort (unique (tw.class.level)), FUN = function (x)
{
if (diss == 'total.inertia') vegan::cca (com[tw.class.level == x, ])$tot.chi else
if (diss == 'whittaker') {com.t <- vegan::decostand (com[tw.class.level == x, ], 'pa'); gamma <- sum (colSums (com.t) > 0); alpha <- mean (rowSums (com.t)); (gamma/alpha)-1} else
if (diss == 'multi.jaccard' || diss == 'multi.sorensen') {com.t <- vegan::decostand (com[tw.class.level == x, ], 'pa'); betapart::beta.multi (com.t, index.family = unlist (strsplit ('multi.sorensen', split = '.', fixed = T))[2])[[3]]} else
if (mean.median == 'mean') mean (vegan::vegdist (com[tw.class.level == x,], method = diss)) else median (vegan::vegdist (com[tw.class.level == x,], method = diss))
}))
#PREPARATION OF DATA
DISS <- c("manhattan", "euclidean", "canberra", "bray",
"kulczynski", "gower", "morisita", "horn", "mountford",
"jaccard", "raup", "binomial", "chao", "altGower", "cao",
"mahalanobis", "total.inertia", "whittaker", "multi.jaccard", "multi.sorensen")
diss <- pmatch(diss, DISS)
if (is.na (diss)) stop ('invalid distance method')
if (diss == -1) stop ('ambiguous distance method')
diss <- DISS[diss]
com <- as.data.frame (com)
species.names <- names (com)
tw.heter <- list ()
names (com) <- vegan::make.cepnames (species.names)
rownames (com) <- make.names (rownames (com)) # this is added to fix the bug with cut.tw, which requires not to use white spaces in row names.
# STANDARD TWINSPAN
if (!modif) tw <- twinspan0 (com, cut.levels = cut.levels, min.group.size = min.group.size, levels = levels, show.output.on.console = show.output.on.console, quiet = quiet, ...)
# MODIFIED TWINSPAN
if (modif)
{
groups01 <- matrix (ncol = clusters-1, nrow = nrow (com))
tw.temp <- list ()
tw <- list ()
tw.temp[[1]] <- twinspan0 (com, cut.levels = cut.levels, min.group.size = min.group.size, levels = 1, show.output.on.console = show.output.on.console, quiet = quiet, ...)
tw.heter[[1]] <- list (tw.class.level = 1, cluster.heter = cluster.heter.fun (com = com, tw.class.level = rep (1, nrow (com)), diss = diss, mean.median = mean.median), no.samples.per.group = nrow (com), which.most.heter = 1)
groups01[,1] <- cut (tw.temp[[1]], level = 1)-1
clusters.temp <- 2
while (clusters.temp != clusters)
{
tw.class.level <- as.numeric (as.factor (apply (groups01[, 1:clusters.temp-1, drop = F], 1, paste, collapse = '')))
cluster.heter <- cluster.heter.fun (com = com, tw.class.level = tw.class.level, diss = diss, mean.median = mean.median)
sort.by.heter <- sort (unique (tw.class.level))[order (cluster.heter,decreasing = T)]
no.samples.per.group <- unlist (lapply (sort.by.heter, FUN = function (no) sum (tw.class.level == no)))
which.most.heter <- sort.by.heter[no.samples.per.group >= min.group.size][1]
tw.heter[[clusters.temp]] <- list (tw.class.level = sort(unique(tw.class.level)), cluster.heter = cluster.heter, no.samples.per.group = no.samples.per.group[order (sort.by.heter)], which.most.heter = which.most.heter)
tw.temp[[clusters.temp]] <- twinspan0 (com[tw.class.level == which.most.heter,], cut.levels = cut.levels, min.group.size = min.group.size, levels = 1, show.output.on.console = show.output.on.console, quiet = quiet, ...)
groups01[,clusters.temp] <- groups01[,clusters.temp-1]
groups01[tw.class.level == which.most.heter, clusters.temp] <- cut (tw.temp[[clusters.temp]], level = 1)-1
clusters.temp <- clusters.temp + 1
}
# for the last group (which is not going to be divided further)
tw.class.level <- as.numeric (as.factor (apply (groups01[, 1:clusters.temp-1, drop = F], 1, paste, collapse = '')))
cluster.heter <- cluster.heter.fun (com = com, tw.class.level = tw.class.level, diss = diss, mean.median = mean.median)
sort.by.heter <- sort (unique (tw.class.level))[order (cluster.heter,decreasing = T)]
no.samples.per.group <- unlist (lapply (sort.by.heter, FUN = function (no) sum (tw.class.level == no)))
which.most.heter <- sort.by.heter[no.samples.per.group >= min.group.size][1]
tw.heter[[clusters.temp]] <- list (tw.class.level = sort(unique(tw.class.level)), cluster.heter = cluster.heter, no.samples.per.group = no.samples.per.group[order (sort.by.heter)], which.most.heter = which.most.heter)
res <- list (order = tw.temp[[1]]$classif$order, plot.no = tw.temp[[1]]$classif$plot.no, class = apply (groups01, 1, FUN = function (x) paste ('*', paste(x, collapse = ''), sep = '')))
tw$classif <- as.data.frame (res)
class (tw) <- c('tw')
tw$twi <- lapply (tw.temp, FUN = function (x) x$twi)
}
tw$spnames <- as.data.frame (cbind (full.name = species.names, abbrev.name = vegan::make.cepnames (species.names)))
tw$modif <- modif
tw$summary <- list (modif = modif, cut.levels = cut.levels, min.group.size = min.group.size, levels = levels, clusters = clusters, diss = diss, min.diss = min.diss, mean.median = mean.median, heter = tw.heter)
class (tw) <- 'tw'
return (tw)
}
twinspan0 <- function (com, cut.levels, min.group.size, levels, show.output.on.console, quiet, ...)
{
actual.wd <- getwd ()
# setwd (paste (.libPaths (), '/twinspanR/exec/', sep = ''))
setwd (paste (find.package ('twinspanR'), '/exec/', sep = ''))
if (is.integer (com[1,1])) com <- sapply (com, as.numeric) # if the data frame contains integers instead of reals, it converts them to real (because of write.CEP in riojaExtra can't handle integers)
com <- com[,colSums (com) > 0]
riojaExtra::write.CEP (com, fName = 'tw.cc!')
create.tw.dat (cut.levels = cut.levels, min.group.size = min.group.size, levels = levels, ...)
output <- shell ('tw.bat', intern = T)
if (show.output.on.console)
{
output [output %in% ' \f'] <- '\n'
output [output %in% '\f Input parameters:'] <- ' Input parameters:'
output [output %in% '\f Name of inputfile? '] <- '\n Name of inputfile? '
cat (output[], sep = '\n')
}
scanned.PUN <- scan (file = 'tw.PUN', what = 'raw', quiet = quiet)
scanned.TWI <- scan (file = 'tw.TWI', what = 'raw', sep = '\n', quiet = quiet)
if (length (scanned.PUN) == 0) res <- 0 else res <- scanned.PUN[10:(which (scanned.PUN == 'SPECIES')-1)]
tw0 <- list ()
tw0$classif <- as.data.frame (matrix (res, ncol = 4, byrow = T))[,-3]
names (tw0$classif) <- c('order', 'plot.no', 'class')
tw0$twi <- scanned.TWI
class (tw0) <- c('tw0')
setwd (actual.wd)
return (tw0)
}
#' @name twinspan
#' @export
summary.tw <- function (object, ...)
{
if (!object$modif)
{
cat ('Standard TWINSPAN (Hill 1979)', spe = '\n\n')
cat ('Basic setting:\n')
cat (c('Pseudospecies cut levels: ', object$summary$cut.levels, '\n'))
cat (c('Minimum group size: ', object$summary$min.group.size, '\n'))
cat (c('Number of hierarchical levels: ', object$summary$levels, '\n'))
}
if (object$modif)
{
cat ('TWINSPAN (Hill 1979) modified according to Rolecek et al. (2009)', spe = '\n\n')
cat ('Basic setting:\n')
cat (c('Pseudospecies cut levels: ', object$summary$cut.levels, '\n'))
cat (c('Minimum group size: ', object$summary$min.group.size, '\n'))
cat (c('Required number of clusters: ', object$summary$clusters, '\n'))
cat (c('Dissimilarity measure: ', object$summary$diss, '\n'))
cat (c('Mean or median of dissimilarity calculated? ', object$summary$mean.median, '\n'))
cat (c('\nResults of modified TWINSPAN algorithm:\n'))
no_print <- lapply (object$summary$heter, FUN = function (het)
{
for_print <- rbind ('Cluster heterogeneity' = formatC(het$cluster.heter, format = 'f', digits = 3), 'No of samples' = formatC(het$no.samples.per.group))
colnames (for_print) <- paste ('CLUST', het$tw.class.level, sep = ' ')
print.default (for_print, quote = F, print.gap = 2, justify = 'right')
cat (c('Candidate cluster for next division: ', het$which.most.heter, '\n\n'))
})
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.