#' onephase
#'
#' \code{onephase} is used to calculate estimations exclusively based on
#' terrestrial observations of a forest inventory (i.e. the \emph{local densities}).
#' The estimation method is available for simple and cluster-sampling
#' and provides point estimates of the sample mean and their variances.
#'
#' @param formula an object of class "\code{\link[stats]{formula}}" that
#' must be of the form \code{Y ~ 1}, where Y is the terrestrial response
#' value of interest provided in \code{data}.
#'
#' @param data a data frame or vector containing the response value Y.
#' Specifications are given under 'Details'.
#'
#' @param phase_id an object of class "\code{\link[base]{list}}" containing two elements:
#' \itemize{
#' \item \code{phase.col}: the column name in \code{data} that specifies the
#' phase membership of each observation
#' \item \code{terrgrid.id}: the indicator identifying the the terrestrial
#' (a.k.a. "ground truth") phase for that column
#' (must be of type "\code{\link[base]{numeric}}")
#' }
#' \strong{Note:} Only has to be specified if \code{data} is of class \code{data.frame}.
#'
#'
#' @param cluster Specifies the column name in \code{data}
#' containing the cluster identification. Only used in case of
#' cluster sampling.
#'
#' @param area (\emph{Optional}) an object of class "\code{\link[base]{list}}" containing two elements:
#' \itemize{
#' \item \code{sa.col}: the column name in \code{data} containing
#' domain identification
#' \item \code{areas}: vector of desired domains for which the estimation
#' should be computed. If estimations for multiple domains should be computed,
#' the domains have to be defined within a \code{character} vector using \code{c()}
#' }
#'
#' Further details of the parameter-specifications are given under \emph{'Details'}.
#'
#' @details
#'
#' \code{data} can either be a vector only containing the observations of the
#' response variable Y,
#' \emph{or} a data frame containing a column for the response variable and
#' a column for the sample-grid indication that has to be further specified
#' by argument \code{phase_id}.
#' Additional \emph{optional} columns include a cluster identification in case of
#' cluster sampling, as well as a column that specifies a domain (e.g. a forest district)
#' the respective terrestrial observation falls into.
#' The latter allows to compute onephase-estimations
#' for multiple domains at a time (see \emph{'Examples'}).
#'
#'
#' @return \code{onephase} returns an object of class \code{"onephase"}.
#'
#' The functions \code{summary} and \code{confint} can be used to obtain a summary of the
#' estimation results (point estimations, variances and sample sizes) and the confidence intervals
#' for the respective point estimates.
#'
#' An object of class \code{"onephase"} returns a \code{list} of the following components:
#'
#' \item{input}{a \code{list} containing the function inputs}
#' \item{estimation}{a data frame containing the following components:
#' \itemize{
#' \item \code{area:} the domain (only present if argument \code{area} has been used)
#' \item \code{estimate:} the point estimate
#' \item \code{variance:} the variance of the point estimate
#' \item \code{n2:} the terrestrial sample size
#' }}
#' \item{samplesizes}{a named numeric vector giving the terrestrial samplesize}
#'
#' @references Hill, A., Massey, A. F. (2021). \emph{The R Package forestinventory: Design-Based Global and Small Area Estimations for Multiphase Forest Inventories.} Journal of Statistical Software, 97(4), 1-40.
#' @references Mandallaz, D. (2007). \emph{Sampling techniques for forest inventories.} Chapter 4. CRC Press.
#'
#' @example examples/example_onephase_estimations.R
#'
#'
#' @export
onephase <- function(formula, data,
phase_id = list(phase.col=NA,terrgrid.id=NA),
cluster= NA, area= list(sa.col = NA, areas = NA)){
# -------------------------------------------------------------------------- #
# -------------------------------------------------------------------------- #
# initial error-checking of mandatory input parameters:
# ------------------------------------- #
# 1) check if data is of type data.frame or vector:
if (!is.data.frame(data) & !is.vector(data)) { stop("data must be a vector or a data.frame")}
# --------------------------------------------------- #
# 2) check if variables used in formula(s) exist in data:
if (is.data.frame(data)) {
if ( prod( all.vars(formula) %in% colnames(data) ) != 1){
missing.var<- all.vars(formula) [which(all.vars(formula) %in% colnames(data) == FALSE)]
stop(paste("Variable ", missing.var," used in regression-formula does not exist in data", sep = ""))
}
}
# --------------------------------- #
# 3) check for input-type "phase_id":
if (all(!is.na(unlist(phase_id)))){
# check if "phase.col" is character:
if (!is.character(phase_id[["phase.col"]])) {
stop("phase.col argument in input-parameter phase_id must be of type character")
}
# check if "phase.col" - name exists in data:
if(phase_id[["phase.col"]] %in% colnames(data) == FALSE){ stop("specified phase column does not exist in data")}
# check if terrgrid in "phase_id" does not exist in data:
if(!all(phase_id[["terrgrid.id"]] %in% sapply(unique(data[phase_id[["phase.col"]]]), as.character))) {
stop("the specified terrestrial grid indicator does not exist in data")
}
}
# -------------------------------------------------------------------------- #
# -------------------------------------------------------------------------- #
# store fucntion-call:
call<- match.call()
# -------------------------------------------------------------------------- #
# -------------------------------------------------------------------------- #
# create one-phase estimation function as a closure:
onephase_estimation<- function(){
# -----------------------#
if (all(!is.na(unlist(area)))){ # if a list of areas is given ...
sa_col<- area[["sa.col"]]
areas<- area[["areas"]]
# ... shrink dataset to current area:
data<- data[data[[sa_col]] == areas[i], ]
}
# if area is not given, dataset remains as it is ...
# -----------------------#
# extract terrestrial dataset:
# get name of response value:
response<- all.vars(as.formula(formula))
# retrieve phase.columnname and indicator of s1 grid id and terrestrial-grid id:
if (all(!is.na(unlist(phase_id)))){ # if phase_id is specified ...
phase.col<- phase_id[["phase.col"]]
s2.ind<- phase_id[["terrgrid.id"]] # identifies the terrestrial sample (s2-sample)
} else { # else assume that dataset only contains of terrestrial data
# --> introduce an artificial terrestrial ground ID (==1):
data<- data.frame(data, phase.col = rep(1, times= ifelse(is.data.frame(data), nrow(data), length(data))))
if(!(response %in% colnames(data))){colnames(data)[1]<- response}
phase.col<- "phase.col"
s2.ind<- 1
}
# extract terrestrial data:
data.terr<- data[data[[phase.col]] == s2.ind, ]
# -----------------------#
# NA-treatment:
# rows to be deleted due to missing response value:
deleted.s2<- !complete.cases(data.terr [,response]) # logical vector returning rows with missing response
sum.NA_omitted<- sum(deleted.s2)
# delete missing rows in entire dataset and produce message:
if(sum.NA_omitted != 0) {
data.terr<- data.terr[- which(deleted.s2),]
message(paste(sum.NA_omitted," rows deleted due to missingness in the response value",sep = ""))
}
# -----------------------#
# ------ non-cluster -----#
if (is.na(cluster)){
# sample size:
n2<- nrow(data.terr)
# check if any terrestrial data available:
if (n2 == 0){
warning("Estimation not possible, set to 'NA': Domain does not contain any terrestrial data", call. = F)
estimation<- data.frame(estimate=NA, variance=NA, n2=n2)
# summarize sample size info:
samplesizes<- n2
names(samplesizes)<- "n2"
} else {
# estimations:
estimate<- mean(data.terr[[response]])
variance<- (1/n2)*var(data.terr[[response]])
if(n2 < 2){
warning("Variance estimation not possible, set to 'NA': Domain contains only one cluster", call. = F)
variance<- NA
}
# summarize sample size info:
samplesizes<- n2
names(samplesizes)<- "n2"
estimation<- data.frame(estimate=estimate, variance=variance, n2=n2)
}
}
# ------- cluster --------#
if (!is.na(cluster)){
# sample size:
n2<- nrow(data.terr)
if (n2 == 0){
warning("Estimation not possible, set to 'NA': Domain does not contain any terrestrial data", call. = F)
estimation<- data.frame(estimate=NA, variance=NA, n2=n2)
# summarize sample size info:
samplesizes<- data.frame(cbind ( 0, n2))
colnames(samplesizes)<- c("n2_clust", "n2")
rownames(samplesizes)<- "plots"
} else {
# get on cluster level:
cluster_weights<- aggregate(data.terr[[response]], list(cluster = data.terr[,cluster]), length) # the M(x) for sample s2
names(cluster_weights)[2]<- "Mx"
# plot sample size:
n2_clusters<- nrow(cluster_weights)
# local densities on cluster level:
cluster_locdens<- aggregate(data.terr[[response]], list(cluster = data.terr[,cluster]), mean)
names(cluster_locdens)[2]<- response
# merge locdens and Mx:
est.dat<- merge(cluster_locdens, cluster_weights, by=cluster)
# estimations:
estimate<- weighted.mean(est.dat[[response]], w = est.dat[["Mx"]])
variance<- (1/(n2_clusters*(n2_clusters-1))) * sum( ((est.dat[["Mx"]] / mean(est.dat[["Mx"]]))^2) * ((est.dat[[response]] - estimate)^2) )
if(n2_clusters < 2){
warning("Variance estimation not possible, set to 'NA': Domain contains only one cluster", call. = F)
variance<- NA
}
# summarize sample size info:
samplesizes<- data.frame(cbind ( n2_clusters, n2))
colnames(samplesizes)<- c("n2_clust", "n2")
rownames(samplesizes)<- "plots"
estimation<- data.frame(estimate=estimate, variance=variance, n2=samplesizes$n2_clust)
}
}
# ---- create outputs ------#
# ... to store inputs used:
inputs<- list()
inputs[["data"]]<- data
inputs[["method"]]<- "onephase"
inputs[["cluster"]]<- !is.na(cluster)
inputs[["call"]]<- call
result<- list(input=inputs,
estimation=estimation,
samplesizes=samplesizes)
return(result)
} # end of onephase_estimation-function
# -------------------------------------------------------------------------- #
# -------------------------------------------------------------------------- #
# run estimations:
if (all(!is.na(unlist(area)))){ # if a list of areas is given ...
# ------------------------------------ #
# ... to store the estimates:
areas<- area[["areas"]]
no.sa <- length(areas)
results <- data.frame(area=rep(NA_character_, no.sa),
estimate=NA_real_, variance=NA_real_,
n2=NA_integer_)
results[,"area"]<-as.character(results[,"area"])
# ... to store the sample sizes:
samplesizes<- list()
# ------------------------------------ #
# loop over areas:
for (i in 1:no.sa) {
result.temp<- onephase_estimation()
# store estimation results
results[i,names(results)[-1]] <- result.temp$estimation
results[i,names(results)[1]] <- areas[i] # add name of small area
# store samplesize of small area:
samplesizes[[areas[i]]]<- result.temp$samplesizes
}
result.temp[["input"]][["data"]]<- data
result<- list(input=result.temp[["input"]],
estimation=results,
samplesizes=samplesizes)
# ------------------------------------ #
} else { # if no areas are given... --> just make onephase est. for entire dataset
result.temp<- onephase_estimation()
result<- list(input=result.temp[["input"]],
estimation=result.temp$estimation,
samplesizes=result.temp$samplesizes)
}
# -------------------------------------------------------------------------- #
# -------------------------------------------------------------------------- #
class(result)<- "onephase"
return(result)
} # end of one-phase function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.