R/sampsd.R

Defines functions sampsd

Documented in sampsd

#' Sampling Simulated Data and Estimation of Multivariate Standard Errors
#'
#' For each simulated data set, this function performs repeated sampling across a range of effort levels
#' and estimates the corresponding MultSE (pseudo-multivariate standard error) using dissimilarity-based methods.
#'
#' @param dat.sim A list of simulated data sets generated by \code{\link{simdata}}.
#' @param Par A list of parameters estimated by \code{\link{assempar}}.
#' @param transformation Mathematical transformation to reduce the influence of dominant species: one of "square root", "fourth root", "Log (X+1)", "P/A", or "none".
#' @param method Dissimilarity metric to use, passed to \code{\link[vegan]{vegdist}} (e.g., "bray", "jaccard", "gower").
#' @param n Maximum number of sampling units per site (must be <= total units available).
#' @param m Maximum number of sites to sample per data set (must be <= total number of sites).
#' @param k Number of repetitions of each sampling configuration (samples × sites) for each data set.
#'
#' @details
#' For multi-site simulations, the function selects subsets of sites (from 2 to \code{m}) and then draws \code{n} samples per site
#' using a two-stage sampling method with inclusion probabilities (Tillé, 2006). For single-site simulations, repeated samples of size
#' 2 to \code{n} are taken without replacement.
#'
#' Each sample undergoes the selected transformation and a dissimilarity matrix is computed.
#' MultSE is estimated using:
#' \itemize{
#'   \item \emph{Single site:} pseudo-variance, with \eqn{MultSE = \sqrt(V/n)}
#'   \item \emph{Multiple sites:} mean squares from a PERMANOVA model (residual and site effects)
#' }
#' This procedure is computationally intensive, especially with large \code{k}. Start with low values for exploration.
#'
#' @return A matrix containing the estimated MultSE values for each simulated data set, sampling effort combination,
#' and repetition. This matrix is used by \code{\link{summary_ssp}}.
#'
#' @note
#' For quick exploratory analysis, use small \code{k}. Once optimal sampling effort is explored,
#' rerun with larger \code{k} (e.g. 100). Computation time will increase accordingly.
#'
#' @references
#' Anderson, M. J., & Santana-Garcon, J. (2015). Measures of precision for dissimilarity-based multivariate analysis of ecological communities. Ecology Letters, 18(1), 66–73.
#'
#' Guerra-Castro, E. J., Cajas, J. C., Simoes, N., Cruz-Motta, J. J., & Mascaro, M. (2021). SSP: An R package to estimate sampling effort in studies of ecological communities. Ecography, 44(4), 561–573. \doi{10.1111/ecog.05284}
#'
#' Tillé, Y. (2006). Sampling Algorithms. Springer, New York.
#'
#' @seealso \code{\link{assempar}}, \code{\link{simdata}}, \code{\link{summary_ssp}}, \code{\link[vegan]{vegdist}}
#'
#' @examples
#' ## Single site example
#' data(micromollusk)
#' par.mic <- assempar(data = micromollusk, type = "P/A", Sest.method = "average")
#' sim.mic <- simdata(par.mic, cases = 3, N = 20, sites = 1)
#' sam.mic <- sampsd(dat.sim = sim.mic, Par = par.mic, transformation = "P/A",
#'                   method = "jaccard", n = 10, m = 1, k = 3)
#'
#' ## Multiple site example
#' data(sponges)
#' par.spo <- assempar(data = sponges, type = "counts", Sest.method = "average")
#' sim.spo <- simdata(par.spo, cases = 3, N = 20, sites = 3)
#' sam.spo <- sampsd(dat.sim = sim.spo, Par = par.spo, transformation = "square root",
#'                   method = "bray", n = 10, m = 3, k = 3)
#'
#' @importFrom vegan vegdist
#' @importFrom stats model.matrix
#' @importFrom sampling balancedtwostage
#'
#' @export

sampsd <- function(dat.sim, Par, transformation, method, n, m, k) {
    names(dat.sim) <- sprintf("%i", 1:length(dat.sim))

    N<-max(dat.sim[[1]][,'N'])
    sites<-max(as.numeric(dat.sim[[1]][,'sites']))
    if (sites == 1) {
        multi.site = FALSE
    } else {multi.site = TRUE}
    if (n > N){
        stop("'n' must be equal or less than 'N'")}
    if (m > sites){
        stop("'m' must be equal or less than 'sites'")}

    if (multi.site == TRUE) {
        # Function to estimate the multivariate standard errors (multiple sites and sample replicates)
        model.MSE <- function(D, y) {
            D = as.matrix(D)
            N = dim(D)[1]
            g = length(levels(factor(y$sites)))
            X = model.matrix(~factor(y$sites))  #model matrix
            H = X %*% solve(t(X) %*% X) %*% t(X)  #Hat matrix
            I = diag(N)  #Identity matrix
            A = -0.5 * D^2
            G = A - apply(A, 1, mean) %o% rep(1, N) - rep(1, N) %o% apply(A, 2, mean) + mean(A)
            MS1 = sum(G * t(H))/(g - 1)  #Mean square of sites
            MS2 = sum(G * t(I - H))/(N - g)  #Mean square of residuals

            #Components of variation and MultSE for each source of variation
            if (MS1 > MS2) {
                CV1 = (MS1 - MS2)/(N/g)
            } else {
                CV1 = 0
            }
            MSE1 = sqrt(CV1/g)
            MSE2 = sqrt(MS2/(N/g))
            return(c(MSE1, MSE2))
        }

        #Function to sample and estimate MultSE
        samp<-function (X = NULL){

            #Matrix to store the results
            n <- seq(2, n, by = 1)
            m <- seq(2, m, by = 1)
            mse.results <- matrix(nrow = length(n) * length(m) * k, ncol = 6)
            mse.results[, 2] <- base::rep(1:k, times = length(n) * length(m))
            mse.results[, 3] <- base::rep(m, times = length(n), each = k)
            mse.results[, 4] <- base::rep(n, times = 1, each = length(m) * k)
            colnames(mse.results) <- c("dat.sim", "k", "m", "n", "MSE.sites", "MSE.n")

            #Objects needed for loop in samp
            Y <- cbind(1:(N * sites))
            YPU <- as.numeric(as.vector(gl(sites, N)))
            NN<-nrow(mse.results)
            mm<-mse.results[,3]
            nn <- rep(NA, NN)
            for (i in seq_len(NN)){
                nn[i] <- mse.results[i, 3] * mse.results[i, 4]
            }
            Par<-Par
            Sest<-Par[[3]]

            #Sampling and estimation of MultSE for each sampling size
            for (i in seq_len(NN)){
                sel <- balancedtwostage(Y, selection = 1, m = mm[i], n = nn[i], PU = YPU, FALSE)

                #replace incorrect probabilities (p < 0 and p > 1) produced by balancetwostage
                sel[sel[,1]<= -1, 1] <- 0
                sel[sel[,1]>= 2, 1] <- 1

                #getting data
                dat.df<-X
                rownames(sel) <- c(1:nrow(dat.df))
                m<-sel[, 1]
                y <- dat.df[m==1,]
                dat <- y[ , 1:Sest]
                dat$dummy <- 1 #dummy constant
                dat<-as.matrix(dat)

                #Transformation of and estimatation of a dissimilarity matrix "D"

                if (transformation == "square root") {
                    dat.t <- sqrt(dat)
                    rm(dat)
                    D <- vegdist(dat.t, method = method)
                }
                if (transformation == "fourth root") {
                    dat.t <- sqrt(sqrt(dat))
                    rm(dat)
                    D <- vegdist(dat.t, method = method)
                }
                if (transformation == "Log (X+1)") {
                    dat.t <- log(dat + 1)
                    rm(dat)
                    D <- vegdist(dat.t, method = method)
                }
                if (transformation == "P/A") {
                    dat.t <- 1 * (dat > 0)
                    rm(dat)
                    D <- vegdist(dat.t, method = method, binary = TRUE)
                }
                if (transformation == "none") {
                    dat.t <- dat
                    rm(dat)
                    D <- vegdist(dat.t, method = method)
                }

                #estimation of MultSE for each source of variation
                mse <- tryCatch(model.MSE(D = D, y = y), error = function(e) {return(c(NA, NA))})

                #Store results
                mse.results[i,5] <- mse[1]
                mse.results[i,6] <- mse[2]
            }
            return(mse.results)
        }#end of samp

        #Apply samp to simulated data
        results<-lapply(dat.sim, samp)
        results<-do.call(rbind, results)

        #Identity of each simulated data
        n <- seq(2, n, by = 1)
        m <- seq(2, m, by = 1)
        results[, 1] <- base::rep(1:length(dat.sim), each = length(n) * length(m) * k)

        return(results)

    } # end of loop for multisites

    if (multi.site == FALSE) {

        # Function to estimate the multivariate standard error for a single site
        mSE <- function(D) {
            n = dim(as.matrix(D))
            ss = sum(D^2)/n
            v = ss/(n - 1)
            x = sqrt(v/n)
            return(x[1])
        }

        #Function to sample and estimate MultSE
        samp<-function (X = NULL){

            #Matrix to store the results
            n <- seq(2, n, by = 1)
            mse.results <- matrix(nrow = length(n) * k, ncol = 4)
            mse.results[, 2] <- rep(1:k, times = length(n))
            mse.results[, 3] <- rep(n, times = 1, each = k)
            colnames(mse.results) <- c("dat.sim", "k", "n", "mSE")

            #Objects needed for loop in samp
            NN<-nrow(mse.results)
            Par<-Par
            Sest<-Par[[3]]

            #estimation of MultSE for each sampling size
            for (i in seq_len(NN)) {
                y <- sample(nrow(X), mse.results[i,3])
                dat<-X[y,1:Sest]
                dat$dummy <- 1 #dummy constant
                dat<-as.matrix(dat)

                if (transformation == "square root") {
                    dat.t <- dat^0.5
                    rm(dat)
                    D <- vegdist(dat.t, method = method)
                }
                if (transformation == "fourth root") {
                    dat.t <- dat^0.25
                    rm(dat)
                    D <- vegdist(dat.t, method = method)
                }
                if (transformation == "Log (X+1)") {
                    dat.t <- log(dat + 1)
                    rm(dat)
                    D <- vegdist(dat.t, method = method)
                }
                if (transformation == "P/A") {
                    dat.t <- 1 * (dat > 0)
                    rm(dat)
                    D <- vegdist(dat.t, method = method, binary = TRUE)
                }
                if (transformation == "none") {
                    D <- vegdist(dat, method = method)
                }

                #estimation of MultSE
                mse <- tryCatch(mSE(D = D), error = function(e) {
                    return(NA)
                })

                #Store results
                mse.results[i, 4] <- mse
            }
            return(mse.results)
        }

        #Apply samp to simulated data
        results<-lapply(dat.sim, samp)
        results<-do.call(rbind, results)

        #Identity of each simulated data
        n <- seq(2, n, by = 1)
        results[, 1] <- base::rep(1:length(dat.sim), each = length(n) * k)

        return(results)
    } # end of loop for a single site

}

Try the SSP package in your browser

Any scripts or data that you put into this service are public.

SSP documentation built on June 8, 2025, 11:41 a.m.