#' @title Summarize two-stage sample
#' @description Summarizes population-level statistics for
#' two-stage sample data. The calculations are derived from Chapter 3 in
#' Avery and Burkhart's (1967) Forest Measurements, Fifth Edition. The
#' variance terms refer to the variance of the mean.
#' @usage summarize_two_stage(data, plot = TRUE, attribute = NA,
#' populationClusters = 0,
#' populationElementsPerCluster = 0,
#' desiredConfidence = 0.95)
#' @param data data frame containing observations of variable of
#' interest for either cluster-level of plot-level data.
#' @param plot logical TRUE if parameter data is plot-level, FALSE if
#' parameter data is cluster-level. Default is TRUE.
#' @param attribute character name of attribute to be summarized.
#' @param populationClusters numeric total number of clusters in the
#' population.
#' @param populationElementsPerCluster numeric total number of
#' elements in the population.
#' @param desiredConfidence numeric desired confidence level (e.g. 0.9).
#' @return data frame of stand-level statistics including
#' standard error and confidence interval limits. All final values are
#' on a 'per plot' basis.
#' @author Karin Wolken
#' @import dplyr
#' @examples
#' \dontrun{
#'
#' # See Forest Sampling vignette for more details
#'
#'
#' # Data can be input as either clusters or plots.
#'
#'
#' # Cluster level data can be expressed as:
#'
#' dataCluster <- data.frame(
#' clusterID = c(1, 2, 3, 4, 5),
#' totClusterElements = c(5, 2, 1, 6, 5),
#' sampledElements = c(2, 2, 2, 2, 2),
#' isUsed = c(T, T, T, T, F),
#' attrSumCluster = c(
#' 1000, 1250, 950,
#' 900, 1005
#' )
#' )
#' # Example:
#' summarize_two_stage(dataCluster, F, "attr")
#'
#'
#' # Plot level data can be expressed as:
#'
#' dataPlot <- data.framedata.frame(
#' clusterID = c(
#' 1, 1, 1, 2, 2, 2,
#' 3, 3, 3, 4, 4, 4,
#' 5, 5, 5, 6, 6, 6
#' ),
#' volume = c(
#' 500, 650, 610, 490,
#' 475, 505, 940, 825,
#' 915, 210, 185, 170,
#' 450, 300, 500, 960,
#' 975, 890
#' ),
#' isUsed = c(
#' T, T, T, T, T, T, T,
#' T, T, T, T, T, T, T,
#' T, T, T, T
#' )
#' )
#' # Example:
#' summarize_two_stage(redData, T, "volume",
#' populationClusters = 16,
#' populationElementsPerCluster = 160
#' )
#' }
#' @export
summarize_two_stage <- function(data, plot = TRUE, attribute = NA,
populationClusters = 0,
populationElementsPerCluster = 0,
desiredConfidence = 0.95) {
if (!is.na(attribute) && (attribute %in% colnames(data))) {
attrTemp <- unlist(data %>% dplyr::select(one_of(attribute)))
if (plot) {
data$attr <- attrTemp
} else {
data$attrSumCluster <- attrTemp
}
}
if (plot) {
# calculates cluster values from plot data
temp <- data %>%
mutate(attr = ifelse(is.na(attr), 0, attr)) %>%
mutate(attrSq = attr^2)
# sum attributes by cluster
attrSum <- aggregate(
temp$attr,
by = list(clusterID = temp$clusterID), FUN = sum
)
attrSqSum <- aggregate(
temp$attrSq,
by = list(clusterID = temp$clusterID), FUN = sum
)
# checks if any secondary is used in any primary/cluster
clusterUsed <- aggregate(
data$isUsed,
by = list(clusterID = data$clusterID), FUN = sum
)
clusterUsed$x <- ifelse(clusterUsed$x > 0, T, F) # converts above to T/F
totalElementsInCluster <- count(data, clusterID) # tally of all elements in each cluster
sampledElementsInCluster <- count(data[data$isUsed, ], clusterID) # tally of elements sampled in each cluster
# attach the sum of attributes and tally of elements to unique cluster
# produce table with key values
cluster <- full_join(clusterUsed, attrSum, by = "clusterID") %>%
full_join(totalElementsInCluster, by = "clusterID") %>%
rename(totClusterElements = n, isUsed = x.x, attrSumCluster = x.y) %>%
full_join(sampledElementsInCluster, by = "clusterID") %>%
select(
clusterID,
totClusterElements,
sampledElements = n,
isUsed,
attrSumCluster
) %>%
mutate(
sampledElements = ifelse(is.na(sampledElements), 0, sampledElements)
) %>%
mutate(attrSqSumCluster = attrSqSum$x)
} else {
# reassigns data as cluster, if input data is cluster-level data
# data must include unique ID ('clusterID'), number of cluster elements
# in each cluster ('totClusterElements'), number of cluster elements
# sampled in each cluster ('sampledElements'), logical true if the
# cluster has any sampled elements ('isUsed'), sum of attributes in
# each cluster ('attrSumCluster')
cluster <- data %>%
mutate(attrSqSumCluster = attrSumCluster^2)
}
if (as.integer(anyDuplicated(cluster$clusterID)) == 1) {
stop("Data cannot have repeated clusterID.")
}
if (length(cluster$clusterID) == 1) {
stop("Must have multiple clusters. Consider other analysis.")
}
n <- sum(cluster$isUsed) # number of sampled clusters
m <- sum(cluster$sampledElements) / n # average number of sampled plots among sampled clusters
EN <- ifelse( # total number of clusters
populationClusters != 0,
populationClusters,
length(cluster$isUsed)
)
EM <- ifelse( # total number of total plots among all clusters
populationElementsPerCluster != 0,
populationElementsPerCluster,
sum(cluster$totClusterElements)
)
finalCalc <- cluster %>%
summarize(
# yBar denominator written for clarity: average m per cluster * n
yBar = sum(attrSumCluster) / (m * n),
s2b = ((sum(attrSumCluster^2) / (m)) -
(sum(attrSumCluster)^2 / (m * n))) / (n - 1),
s2w = (sum(attrSqSumCluster) - sum(attrSumCluster^2) / (m)) /
(n * (m - 1)),
df = sum(sampledElements)
) %>%
mutate(
ySE = ifelse(
length(unique(cluster$totClusterElements)) == 1,
# if clusters are equal in size, num of elements per cluster is equal:
sqrt(
(1 / (m * n)) * ((s2b * (1 - n / EN) +
n * s2w / EN * (1 - m / EM))[[1]])
),
ifelse(n / EN < 0.2,
# if n is a small fraction of N, SE is simplified to:
sqrt(s2b^2 / (m * n)),
ifelse(m / EM < 0.2,
# when n/N is fairly large, but num of secondaries (m)
# sampled in each selected primary is only a fraction
# of the total secondaries (M) in each primary (n)
sqrt(1 / (m * n) * (s2b * (1 - n / EN) + (n * s2w) / EN)),
# SE calculated below is approximated based on available
# methods and may not be accurate
# These statements are intended to catch all other samples
if (n / EN < m / EM) {
sqrt(s2b^2 / (m * n))
} else {
sqrt(1 / (m * n) * (s2b * (1 - n / EN) + (n * s2w) / EN))
}
)
)
)
) %>%
mutate(
tScore = qt(1 - ((1 - desiredConfidence) / 2), df),
highCL = yBar + tScore * ySE,
lowCL = yBar - tScore * ySE
) %>%
select(-tScore)
clusterSummary <- finalCalc %>%
summarize(
mean = yBar,
varianceB = s2b,
varianceW = s2w,
standardError = ySE,
upperLimitCI = highCL,
lowerLimitCI = lowCL
)
# return data frame of stand-level statistics
return(clusterSummary)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.