R/ustar_threshold.R

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#+++ R script with methods for ustar filtering +++
#+++ Ustar filtering adapted after the idea in Papale, D. et al. (2006) +++
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

#' =============================================================================
#' Set Ustar Processing Scenarios
#' =============================================================================
#'
#' Original name: sEddyProc_sSetUstarScenarios
#' @author
#'
#' @param ustar_thr Data frame returned by \code{\link{usGetAnnualSeasonUStarMap}}
#'   or \code{\link{usGetSeasonalSeasonUStarMap}}. First column, season names,
#'   and remaining columns different estimates of ustar threshold. If
#'   \code{ustar_thr} has only one row, then each ustar threshold estimate is
#'   applied to the entire dataset. Entries in first column must match levels in
#'   argument \code{sf} of
#'   \code{\link{sEddyProc_sEstUstarThresholdDistribution}}.
#' @param uStarSuffixes Suffixes appended to result column names; by default the
#'   column names of ustar_thr unless its first season column.
#'
#' @details
#' The eddy covariance method does not work with low turbulence conditions, and
#' periods with low turbulence indicated by low ustar must be filtered out and
#' gap-filled (see \code{\link{sEddyProc_sMDSGapFill}}). The threshold value of
#' a sufficient ustar causes one of the largest uncertainty components within
#' the gap-filled data. Hence, it is good practice to compare derived quantities
#' based on gap-filled data using different ustar threshold values.
#'
#' For example a user could provide the the following columns in argument
#' \code{uStarTh}
#' \itemize{
#' \item season: identifier for which season this row is used.
#' \item Ustar: estimate on original series
#' \item U05: 5% of bootstrap
#' \item U50: median of bootstrap
#' \item U95: 95% of bootstrap)
#' }
#' Usually, Scenarios are retrieved from resuls of estimating the distribution
#' by \code{\link{sEddyProc_sEstUstarThresholdDistribution}} and then taking
#' either
#' \itemize{
#' \item annually aggregated value for all seasons
#'   (\code{\link{usGetAnnualSeasonUStarMap}}) or
#' \item seasonal estimates (\code{\link{usGetSeasonalSeasonUStarMap}}).
#' }
#'
#' The following functions apply a processing step to all of the scenarios.
#' \itemize{
#' \item \code{\link{sEddyProc_sMDSGapFillUStarScens}}: gap-filling
#' \item \code{\link{sEddyProc_sMRFluxPartitionUStarScens}}: flux-partitioning
#' }
#' The generated output columns are distinguished by appending a suffix to the
#' respective column name. Then the spread across those columns is an estimate
#' of the uncertainty introduced by not knowing the exact value of the u*
#' threshold.
#'
#' @export
sEddyProc_sSetUstarScenarios <-
  function(ustar_thr, uStarSuffixes = colnames(ustar_thr)[-1]) {
    if (!("season" %in% colnames(sTEMP)) ) {
      stop("Seasons not defined yet. Add column 'season' to dataset with entries
           matching column season in UstarThres.df, e.g. by calling
           yourEddyProcClass$sSetUStarSeasons(...)")
    }
    if (!all(is.finite(as.matrix(ustar_thr[, -1])))) {
      warning("Provided non-finite uStarThreshold for some periods. All values in
              corresponding period will be marked as gap.")
    }
    if (nrow(ustar_thr) == 1L) {
      ustar_thr <- cbind(data.frame(
        season = levels(.self$sTEMP$season)), ustar_thr[, -1], row.names = NULL)
    }
    missing <- which( !(levels(.self$sTEMP$season) %in% ustar_thr[[1]]))
    if (length(missing)) {
      stop("Need to provide uStar threshold for all seasons, but was missing for
           seasons", paste(levels(.self$sTEMP$season)[missing], collapse = ","))
    }
    nEstimates <- ncol(ustar_thr) - 1L
    uStarSuffixes <- unique(uStarSuffixes)
    if (length(uStarSuffixes) != nEstimates) {
      stop("Number of unique suffixes must correspond to number of ustar
           thresholds, i.e. number of columns in ustar_thr - 1.")
    }
    sUSTAR_SCEN <- ustar_thr
    colnames(sUSTAR_SCEN)[-1] <- uStarSuffixes
  }

#' =============================================================================
#' Estimate ustar threshold (call)
#' =============================================================================
#'
#' Original name: sEddyProc_sEstUstarThold
#' @author TW
#'
#' @description Calling \code{\link{est_ustar_thr}} for class data and
#'   storing results.
#'
#' @param data A data frame.
#' @param ustar Column name for ustar.
#' @param NEE Column name for NEE.
#' @param Tair Column name for air temperature.
#' @param Rg Column name for solar radiation.
#' @param ... Further arguments to \code{\link{est_ustar_thr}}.
#' @param sf Factor of seasons to split.
#'
#' @return Result component \code{uStarTh} of \code{\link{est_ustar_thr}}. In
#'   addition the result is stored in class variable \code{sUSTAR_DETAILS}.
#'
#' @export
get_ustar_thr <- function(temp, ustar = "ustar", NEE = "NEE",
                          Tair = "Tair", Rg = "Rg", ...,
                          sf = create_sf(data$timestamp)) {
  if (length(sf)) temp$season <- sf
  if (is.null(temp$season)) {
    stop("Ustar seasons must be set by argument 'sf'.", call. = FALSE)
  }
  req <- c("timestamp", ustar, NEE, Tair, Rg)
  missing <- which(!(req %in% names(.self$sDATA)))
  if (length(missing)) {
    stop("Missing columns in dataset: ", paste(req[missing], collapse = ","))
  }
  data <- .self$sDATA[, req, drop = FALSE]
  colnames(data) <- c("timestamp", "ustar", "NEE", "Tair", "Rg")
  resEst <- est_ustar_thr(data, sf = .self$sTEMP$season, ...)
  sUSTAR_DETAILS <- resEst
  return(resEst$uStar)
}

## =============================================================================
#' Estimate ustar threshold
#'
#' Original name: usEstUstarThreshold
#' @author TW, OM
#'
#' @description Estimate the ustar threshold by aggregating the estimates for
#'   seasonal and temperature subsets.
#'
#' @param data Data frame with columns "timestamp", "ustar", "NEE", "Tair", and
#'   "Rg".
#' @param sf Factor for subsetting times (see details).
#' @param ctrl_est Control parameters for estimating ustar on a single
#'   binned series, see \code{\link{control_ustar}}.
#' @param ctrl_sub Control parameters for subsetting time series (number of
#'   temperature and ustar classes), see \code{\link{subset_ustar}}.
#' @param binned_fun Function to estimate ustar on a single binned
#'   series, see \code{\link{est_ustar_fw2}}.
#' @param cleaned Set to TRUE if the data was cleaned already (avoids
#'   expensive call to \code{get_ustar_indices}.
#'
#' @details
#' The threshold for sufficiently turbulent conditions u* (ustar) is estimated
#' for different subsets of the time series. From the estimates for each season
#' (each value in \code{sf}) the maximum of all seasons of one year is reported
#' as estimate for this year. Within each season the time series is split by
#' temperature classes. Among these ustar estimates, the median is reported as
#' season value.
#'
#' In order to split the seasons, a vector with argument \code{sf} must be
#' provided. All positions with the same factor belong to the same season. It is
#' generated by one of the methods in the function \code{\link{create_sf}}:
#' * \code{create_sf(type = "month")} (default DJF-MAM-JJA-SON with December
#'   from previous to January of the year)
#' * \code{create_sf(type = "ymonth")} (default DJF-MAM-JJA-SON with December
#'   from the same year)
#' * \code{create_sf(type = "yday")} for a refined specification of season
#'   starts.
#' * \code{create_sf(type = "year")} for specifying different seasons between
#'   years.
#'
#' The estimation of ustar on a single binned series can be selected argument
#' \code{est_ustar_binned}:
#' * \code{\link{est_ustar_fw1}}
#' * \code{\link{est_ustar_fw2}} (default)
#'
#' This function is called by:
#' \itemize{
#' \item \code{\link{get_ustar_thr}} which stores the result in the
#'   class variables (sUSTAR and sDATA).
#' \item \code{\link{sEddyProc_sEstUstarThresholdDistribution}} which
#'   additionally estimates median and confidence intervals for each year by
#'   bootstrapping the original data within seasons.
#' }
#'
#' Inspecting the NEE~uStar relationship plotting is provided by
#' \code{\link{sEddyProc_sPlotNEEVersusUStarForSeason}}
#'
#' Add index columns to locate which season / tempClass / uStarBin each record
#' belongs. Cannot directly change sDATA, in EddyProcC, because will be
#' overwritten in each bootstrap.
#'
#' \describe{
#' \item{change point detection (CPT) method}{
#' With specifying \code{ctrl_est = control_ustar(diff_temp = TRUE)} change
#' point detection is applied instead of the moving point test (e.g. with
#' Fw2Binned). The sometimes sensitive binning of uStar values within a
#' temperature class is avoided. Further, possible spurious thresholds are
#' avoided by testing that the model with a threshold fits the data better than
#' a model without a threshold using a likelihood ratio test. In addition, with
#' CPT seasons are excluded where a threshold was detected in only less than
#' ctrl_est$min_prop (default 20%) of the temperature classes. Note that this
#' method often gives higher estimates of the ustar threshold.}
#' }
#'
#' \describe{
#' \item{One-big-season fallback}{
#' If there are too few records within one year, or when no season yielded a
#' finite ustar threshold estimate, then the yearly ustar threshold is estimated
#' by pooling the data from seasons within one seasonYear. The user can suppress
#' using pooled data on few records by providing option
#' \code{ctrl_sub$one_season = FALSE} (see \code{\link{subset_ustar}})}
#' }
#'
#' @return
#' A list with entries:
#' * ustar_thr A data.frame with columns "agg_mod", "seasonYear", "season",
#'   "ustar" and rows for "single" (the entire aggregate median across years),
#'   "seasonYear" (each year maximum across seasons or estimate on pooled data),
#'   and "season" (each season median across temperature classes).
#' * seasonYear A data.frame listing results for year with columns "seasonYear",
#'   "uStarMaxSeason" (the maximum across seasonal estimates within the year),
#'   "ustar_pooled" (the estimate based on data pooled across the year - only
#'   calculated on few valid records or on uStarMaxSeason was nonfinite), "recs"
#'   (number of valid records - only if the pooled estimate was calculated),
#'   "ustar_agg" (chosen estimate corresponding to ustar_pooled if it was
#'   calculated, or uStarMaxSeason or ustar_thr across years if the former was
#'   non-finite).
#' * season A data.frame listing results for each season:
#'   * recs The number of valid records.
#'   * uStarSeasonEst The estimate for based on data within the season (median
#'     across temperature classes).
#'   * ustar_agg The chosen estimate, corresponding to uStarSeasonEst or the
#'     yearly seasonYear$ustar_agg, if the former was non-finite.
#' * tempInSeason A numeric matrix (nTemp x nAggSeason) containing estimates
#'   for each temperature subset for each season.
#' * bins Columns \code{season}, \code{temp_bin} and \code{ustar_bin} for each
#'   record of input \code{ds} reporting classes of similar environmental
#'   conditions that the record belongs to.
#' }
#'
#' @references
#' Papale, D. et al. (2006) Towards a standardized processing of net ecosystem
#' exchange measured with eddy covariance technique: algorithms and uncertainty
#' estimation. Biogeosciences 3(4): 571-583.
#'
#' @export
est_ustar_thr <- function(data, sf = create_sf(data$timestamp), sf_yr,
                          ctrl_est = control_ustar(), ctrl_sub = subset_ustar(),
                          binned_fun = est_ustar_fw2, cleaned = FALSE) {
	if (any(is.na(sf))) {
	  stop("Encountered NA in season factor. Need valid season for each record.",
	       call. = FALSE)
	}
  # For each sf level get the year (aggregation period) of the season.
  timestamp <- data$timestamp
  origin <- as.POSIXct("1970-01-01 00:00.00", tz = "UTC")
  tz <- attr(timestamp[1], "tzone")
  res <- tapply(timestamp, sf, fun = function(dates) {
    x <- as.numeric(dates)
    center <- x[1] + (x[length(x)] - x[1]) / 2
    1900L + as.POSIXlt(center, origin = origin, tz = tz)$year
  })
  sf_yr <- structure(as.vector(res), names = rownames(res)) # Convert to vector
  data$season <- as.factor(sf)
	data$sn_yr <- sf_yr[sf]
	data$t_bin <- NA
	data$ustar_bin <- NA
	# Extract valid nighttime records
	if (cleaned) {
		valid <- TRUE
		dsc <- data
	} else {
	  valid <-
	    is.finite(data[, "NEE"]) &
	    is.finite(data[, "Tair"]) &
	    is.finite(data[, "ustar"]) &
	    is.finite(data[, "Rg"])
	  valid <- valid & data[, "Rg"] < ctrl_sub$night_thr
		dsc <- data[valid, , drop = FALSE]
	}
	if (nrow(dsc) == 0L) stop("No finite records in dataset.", call. = FALSE)
	tdsc <- as.data.frame(table(dsc$season))
	colnames(tdsc) <- c("season", "recs")
	# Some seasons might be absent in dsc from cleaning, construct vectors that
	# report NA for missing seasons
	n_sn <- merge(data.frame(season = levels(ds$season)), tdsc, all.x = TRUE)
	n_sn_yr <- merge(n_sn, data.frame(season = names(sf_yr), sn_yr = sf_yr),
	                 all.x = TRUE)
	n_yr <-
	  n_sn_yr %>%
	  group_by(!!sym("seasyr")) %>%
	  summarize(n_rec = sum(!!sym("recs"), na.rm = TRUE))
	seasonYearsWithFewData <- n_yr$sn_yr[n_yr$n_rec < ctrl_sub$min_recs]
	# Use CPT if requested
	est_ustar_season <- function(...) {
		if (isTRUE(ctrl_sub$diff_temp)) {
			.estimateUStarSeasonCPTSeveralT(...)
		} else est_ustar_season(...)
	}
	ustar_sn_tl <-
	  dsc %>%
	  split(.$season) %>%
	  map(est_ustar_season, ctrl_sub = ctrl_sub,
	      ctrl_est = ctrl_est,
	      binned_fun = binned_fun)
	ustar_sn_t <- do.call(rbind, map(ustar_sn_tl, 1L))
	ustar_sn <- apply(ustar_sn_t, 1, median, na.rm = TRUE)
	# Report NA where threshold was found in less than 20% of temperature classes
	invalid <- (rowSums(is.finite(ustar_sn_t)) / ncol(ustar_sn_t)) <
	  ctrl_est$min_prop
	ustar_sn[invalid] <- NA
	# extract the temperature and bin indices
	for (season in names(ustar_sn_tl) ) {
	  bins <- c("t_bin", "ustar_bin")
		dsc[dsc$season == season, bins] <- ustar_sn_tl[[season]]$bins
	}
	# check correct ordering
	results_sn <- n_sn_yr
	results_sn$uStarSeasonEst <- ustar_sn
	results <-
	  results_sn %>%
	  group_by(!!sym("sn_yr")) %>%
	  summarize(uStarMaxSeason = {
	    if (all(!is.finite(!!sym("uStarSeasonEst")))) {NA}
	    else {max(!!sym("uStarSeasonEst"), na.rm = TRUE)}},
	    n_rec = sum(!!sym("n_rec")))
	results$ustar_agg <- results$uStarMaxSeason
	#---- for season_years with too few records and for season_years with no
	# seasonal estimate do a pooled estimate
	sn_yr_pooled <- results$sn_yr[!is.finite(results$ustar_agg) & results$n_rec > 0]
	if (isTRUE(ctrl_sub$one_season)) {
	  sn_yr_pooled <- union(seasonYearsWithFewData, sn_yr_pooled)
	}
	results_pooled <-
	  if (!length(sn_yr_pooled) ) {
	    results_pooled <- data.frame(sn_yr = NA, recs = NA,
	                                 ustar_pooled = NA)[FALSE, ]
	    } else {
				pooled <- filter(dsc, UQ(sym("seasyr")) %in% !!sn_yr_pooled)
				ustar_yrtl <-
				  tmp <-
				  pooled %>%
				  split(.$seasyr) %>%
				  map(fEstimateUStarSeason, ctrl_sub = ctrl_sub,
				      ctrl_est = ctrl_est,
				      binned_fun = binned_fun)
				# matrix (nSeason x nTemp)
				ustar_yrt <-  do.call(rbind, map(ustar_yrtl, 1L))
				ustar_yr <- apply(ustar_yrt, 1, median, na.rm = TRUE)
				# Report NA where threshold was found in less than 20% of temperature
				# classes
				invalid <- (rowSums(is.finite(ustar_yrt)) / ncol(ustar_yrt)) <
				  ctrl_est$min_prop
				ustar_yr[invalid] <- NA
				# Omit gettting binning into ds (do not overwrite bins from seasonal
				# estimates)
				results_pooled <- data.frame(seasyr = names(ustar_yr),
				                             recs = as.vector(table(pooled$seasyr)),
				                             ustar_pooled = ustar_yr)
			}
	results <- merge(results, results_pooled, all.x = TRUE)
	finite_pooled <- is.finite(results$ustar_pooled)
	results$ustar_agg[finite_pooled] <- results$ustar_pooled[finite_pooled]
	#----- overall is the median across years
	ustar_medyr <- median(results$ustar_agg, na.rm = TRUE)
	message(paste("Estimated ustar threshold of: ", signif(ustar_medyr, 2),
	              "by using controls:\n",
	              paste(capture.output(unlist(ctrl_sub)), collapse = "\n")))
	#----- propagate aggregate estimates back to NA-slots of years and seaons
	non_finite <- !is.finite(results$ustar_agg)
	results$ustar_agg[non_finite] <- ustar_medyr
	# Merge yearly aggregated estimates to season and replace by finite
	# seasonal estimates
	results_seas <- merge(results_seas, results[, c("seasyr", "ustar_agg")],
	                      all.x = TRUE)
	finite_est <- (is.finite(results_seas$uStarSeasonEst))
	results_seas$ustar_agg[finite_est] <- results_seas$uStarSeasonEst[finite_est]
	df_out <- results_seas[, c("season", "seasyr", "ustar_agg")]
	df_out$season <- as.factor(df_out$season)
	df_out$agg_mod <- "season"
	tmp <- rbind(cbind(data.frame(agg_mod = "year", season = as.factor(NA)),
	                   results[, c("seasyr", "ustar_agg")]), df_out)
	df_out <- tmp
	tmp <- rbind(cbind(data.frame(agg_mod = "single", season = as.factor(NA),
	                              seasyr = NA), ustar_agg = ustar_medyr), df_out)
	df_out <- tmp
	df_out$ustar <- df_out$ustar_agg
	# Store indices in data, first remove columns
	data[valid, ] <- dsc
	# Check correct ordering, return list with entries
	res <- list(ustar_thr = df_out[, c("agg_mod", "seasyr", "season", "ustar")],
	            seasyr = results, season = results_seas,
	            tempInSeason = t(ustar_seast),
	            bins = data[, c("season", "t_bin", "ustar_bin")])
	return(res)
}

#' =============================================================================
#' Plot NEE versus Ustar for Data of a Subset with Estimates
#' =============================================================================
#'
#' Original name: .plotNEEVersusUStarTempClass
#' @author TW
#'
#' @param data Data frame or tibble with columns NEE, ustar, Tair/Tsoil,
#'   ustar_bin, and timestamp.
#' @param ustar_thr Value of ustar of an estimated threshold.
#' @param ustar Column name for ustar.
#' @param NEE Column name for NEE
#' @param Tair Column name for temperature variable.
#' @param xlab Custom x-axis label. If NULL (default), the x-axis is given the
#'   label \code{bquote(u["*"] * " (" * m ~ s^-1 * ")")}.
#' @param ylab Custom y-axis label. If NULL (default), the y-axis is given the
#'   label \code{bquote("NEE (" * gC ~ m^-2 ~ yr^-1 * ")")}.
#'
#' @details
#' For each ustar_bin, mean of NEE and ustar is calculated.
#'
#' @return Side effect of plotting NEE ~ ustar with indicating means of the
#'   bins, ustar_thr, date range, and temperature range.
#'
plot_ustar_thr <- function(data, ustar_thr, ustar = "ustar", NEE = "NEE",
                           Tair = "Tair", xlab = NULL, ylab = NULL) {
  dss <- data[, c(NEE, ustar, Tair, "ustar_bin", "timestamp")]
  colnames(dss) <- c("NEE", "ustar", "Tair", "ustar_bin", "timestamp")
  dssm <-
    dss %>%
    group_by(!!sym("ustar_bin")) %>%
    summarize(
      ustar_avg = mean(!!sym("ustar")),
      NEE_avg = mean(!!sym("NEE"))
    )
  if (is.null(xlab)) {
    xlab <- bquote(u["*"] * " (" * m ~ s^-1 * ")")
  } else {
    xlab <- bquote(xlab)
  }
  if (is.null(ylab)) {
    ylab <- bquote("NEE (" * gC ~ m^-2 ~ yr^-1 * ")")
  } else {
    ylab <- bquote(ylab)
  }
  plot(
    NEE ~ ustar, dss,
    col = adjustcolor("brown", alpha.f = 0.3),
    ylim = quantile(dss$NEE, c(0.02, 0.98)), xlab = xlab, ylab = ylab
  )
  points(NEE_avg ~ ustar_avg, dssm, pch = "+", cex = 1.5)
  abline(v = ustar_thr, lty = "dashed", col = "darkgrey", lwd = 2)
  date_range <- strftime(range(dss$timestamp), "%d.%m.%y")
  legend("topleft", legend = c(
    paste(
      data$season[1], " (", date_range[1],
      "-", date_range[2], ")",
      sep = ""
    ),
    sprintf(
      " (%1.1f-%1.1f\u00B0C)", min(dss$Tair),
      max(dss$Tair)
    ),
    sprintf("ustar_thr =%1.2f", ustar_thr)
  ))
}

#' =============================================================================
#' Estimate uStar threshold for data of a given season
#' =============================================================================
#'
#' Original name: .estimateUStarSeason
#' @author TW
#'
#' @param dsi A data frame with columns ustar, NEE, Tair, and Rg.
#' @param ctrl_sub
#' @param ctrl_est
#' @param est_ustar_binned
#'
#' @return
#' List with entries:
#' * ustar_thr A vector of ustar threshold for temperature classes.
#' * bins A data frame with columns temp_bin, ustar_bin for each row in dsi.
#'
est_ustar_season <- function(dsi, ctrl_sub, ctrl_est, est_ustar_binned) {
  res_na <- list(
    ustar_thr = rep(NA, ctrl_sub$temp_classes),
    bins = data.frame(
      temp_bin = rep(NA, nrow(dsi)),
      ustar_bin = rep(NA, nrow(dsi))
    )
  )
  if (nrow(dsi) < ctrl_sub$min_recs_season) {
    warning(
      "Too few finite records within season (n = ", nrow(dsi), ").",
      "Need at least n = ", ctrl_sub$min_recs_season,
      ". Returning NA for this season.",
      call. = FALSE
    )
    return(res_na)
  }
  if (nrow(dsi) / ctrl_sub$temp_classes < ctrl_sub$min_recs_temp) {
    warning(
      "Too few finite records within season (n = ", nrow(dsi), ")",
      "for ", ctrl_sub$temp_classes, " temperature classes.",
      " Need at least n = ", ctrl_sub$min_recs_temp * ctrl_sub$temp_classes,
      ". Returning NA for this season.",
      call. = FALSE
    )
    return(res_na)
  }
  order_temp <- order(dsi[["Tair"]])
  # Default for methods that do not bin ustar
  ustar_sorted <- integer(nrow(dsi))
  # Sort values in a season by air temperature (later in class by ustar)
  sort <- dsi[order_temp, , drop = FALSE]
  # Set up vector that contains ustar values for temperature classes
  ustar_thr <- vector(length = ctrl_sub$temp_classes)
  TId <- bin_balanced(sort[, "Tair"], ctrl_sub$temp_classes)
  # Loop through k temperature classes
  for (k in 1:ctrl_sub$temp_classes) {
    temp_class <- TId == k
    sort_temp <- sort[temp_class, ]
    # Threshold only accepted if Tair and ustar are not correlated
    cor1 <- suppressWarnings(abs(cor(
      sort_temp[["ustar"]],
      sort_temp[["Tair"]]
    )))
    if ((is.finite(cor1)) && (cor1 < ctrl_est$corr_thr)) {
      if (isTRUE(ctrl_est$cpt)) {
        if (!requireNamespace("segmented")) {
          stop("Need to install package segmented before using Change Point
				       Decetion for estimation of ustar threshold.", call. = FALSE)
        }
        res_cpt <- try(.fitSeg1(
          sort_temp[["ustar"]],
          sort_temp[["NEE"]]
        ), silent = TRUE)
        ustar_thr[k] <- if (
          inherits(res_cpt, "try-error") ||
            !is.finite(res_cpt["p"]) ||
            res_cpt["p"] > 0.05
        ) {
          NA
        } else {
          res_cpt["cp"]
        }
      } else {
        res_bin <- bin_ustar(
          sort_temp[["NEE"]], sort_temp[["ustar"]],
          ctrl_sub$ustar_classes
        )
        binned_ustar <- res_bin$bin_avgs
        ustar_sorted[temp_class] <- res_bin$ustar_bins
        if (any(!is.finite(binned_ustar[[2]]))) {
          stop("Encountered non-finite average NEE for a ustar bin. Data must be
					     provided with non-finite columns ustar and NEE for ustar
					     threshold detection.", call. = FALSE)
        }
        ustar_thr[k] <- if (binned_ustar[[1]][1] > ctrl_est$check_first) {
          ## If first mean ustar bin is already large (>ctrl_est$check_first),
          ## this temperature class is skipped from estimation
          NA
        } else {
          est_ustar_binned(binned_ustar, ctrl_est = ctrl_est)
        }
      }
    } else {
      # Correlation between T and ustar too high: fill respective cell with NA
      ustar_thr[k] <- NA
    }
  }
  # Bins of temperature and ustar have been generated on sorted data.frame.
  # Need to assign to original positions
  ustar_unsorted <- integer(length(order_temp))
  TIdUnsorted <- ustar_unsorted
  TIdUnsorted[order_temp] <- TId
  ustar_unsorted[order_temp] <- ustar_sorted
  invisible(list(
    ustar_thr = ustar_thr,
    bins = data.frame(
      temp_bin = TIdUnsorted,
      ustar_bin = ustar_unsorted
    )
  ))
}


## =============================================================================
#' Create List of Parameters for Determining Ustar of Single Binned Series
#'
#' Original name: usControlUstarEst
#' Author: TW
#'
#' @param plateau_fwd Number of subsequent ustar bin values to compare to in
#'   fwd mode.
#' @param plateau_back Number of subsequent ustar bin values to compare to in
#'   back mode.
#' @param plateau_crit Significant differences between a ustar value and the
#'   mean of a "plateau".
#' @param corr_thr Threshold value for correlation between Tair and ustar data.
#' @param check_first If first ustar bin average of a class is already
#'   larger than this value, the temperature class is skipped.
#' @param omit_bins If TRUE (default), bins where no threshold was found are
#'   ignored. Set to FALSE to report highest ustar bin for these cases.
#' @param diff_temp Set to TRUE to use Change Point Detection without
#'   binning ustar but with additionally changed aggregation scheme for several
#'   temperature classifications.
#' @param cpt Set to TRUE to use Change Point Detection without binning
#'   ustar before in usual aggregation method (good for comparing methods, but
#'   not recommended, overruled by diff_temp = TRUE).
#' @param min_prop Seasons in which only less than this
#'   proportion of temperature classes a threshold was detected, are excluded
#'   from aggregation.
#' @param min_boot Minimum proportion of bootstrap samples for which a
#'   threshold was detected. Below this proportion NA quantiles are reported.
#' @param min_recs Minimum number of records in plateau, threshold must
#'   be larger than mean of this many bins.
#'
#' @export
control_ustar <- function(plateau_fwd = 10, plateau_back = 6,
                          plateau_crit = 0.95, corr_thr = 0.5,
                          check_first = 0.2, omit_bins = TRUE,
                          diff_temp = FALSE, cpt = FALSE, min_prop = 0.2,
                          min_boot = 0.4, min_recs = 3L) {
  ctrl <- list(
    plateau_fwd = plateau_fwd, plateau_back = plateau_back,
    plateau_crit = plateau_crit, corr_thr = corr_thr, check_first = check_first,
    omit_bins = omit_bins, cpt = cpt, diff_temp = diff_temp,
    min_prop = min_prop, min_boot = min_boot, min_recs = min_recs
  )
  # display warning message for variables that we advise not to be changed
  if (corr_thr != 0.5) {
    warning("Parameter 'corr_thr' is set to a non-default value.",
            call. = FALSE)
  }
  return(ctrl)
}

#' =============================================================================
#' List of parameters for subsetting data for ustar threshold estimation
#' =============================================================================
#'
#' Original name: usControlUstarSubsetting
#' @author TW
#'
#' @param temp_classes number of Tair classes
#' @param ustar_classes number of ustar classes
#' @param night_thr nighttime data threshold for solar radiation (in W+1m-2)
#' @param min_recs_temp minimum number of records within one temperature
#'   class
#' @param min_recs_season minimum number of records within one season
#' @param min_recs_year minimum number of records within one year
#' @param one_season logical, set to FALSE to avoid aggregating
#'   all seasons on too few records
#'
#'  @export
subset_ustar <- function(temp_classes = 7, ustar_classes = 20, night_thr = 10,
                         min_recs_temp = 100, min_recs_season = 160,
                         min_recs_year	= 3000, one_season = TRUE) {
	ctrl <- list(
	  temp_classes = temp_classes, ustar_classes = ustar_classes,
	  night_thr = night_thr, min_recs_temp = min_recs_temp,
	  min_recs_season = min_recs_season, min_recs_year = min_recs_year,
	  one_season = one_season
	 )
	# TODO: loop all this nonsense??
  if (ctrl$night_thr != 10) {
    warning("Parameter night_thr set to non default value.")
  }
  if (ctrl$temp_classes != 7) {
    warning("Parameter temp_classes set to non default value.")
  }
  if (ctrl$ustar_classes != 20) {
    warning("Parameter ustar_classes set to non default value.")
  }
  if (ctrl$min_recs_temp != 100) {
    warning("Parameter min_recs_temp set to non default value.")
  }
  if (ctrl$min_recs_season != 160) {
    warning("Parameter min_recs_season set to non default value.")
  }
  if (ctrl$min_recs_year != 3000) {
    warning("Parameter min_recs_year set to non default value.")
  }
  return(ctrl)
}

## =============================================================================
#' Calculate Season Factor
#'
#' @description Compute year-spanning or year-bounded season factor by starting
#'   month, year-day, or year and year-day.
#'
#' Original names: usCreateSeasonFactorMonth,
#' usCreateSeasonFactorMonthWithinYear, usCreateSeasonFactorYday,
#' usCreateSeasonFactorYdayYear
#' @author TW
#'
#' @param dates POSIXct vector of length of the data set to be filled,
#'   specifying the center-time of each record.
#' @param month Integer (1-12) vector of length of the data set to be filled,
#'   specifying the month for each record.
#' @param yday Integer (1-366) vector of length of the data set to be filled,
#'   specifying the day of the year for each record.
#' @param year Integer vector of length of the data set to be filled, specifying
#'   the year.
#' @param start_month Integer vector specifying the starting month for each
#'   season, counting from one. Default is (Dec, Jan, Feb); (Mar, Apr, May);
#'   (Jun, Jul, Aug); (Sep, Oct, Nov).
#' @param start_yday Integer vector (1-366) specifying the starting yday for
#'   each season in increasing order.
#'
#' @details
#' For type "month":
#' Calculate factors to denote the season for ustar-filtering by specifying
#' starting months, with continuous seasons spanning year boundaries. If Jan is
#' not a starting month, then the first months of each year will be part of the
#' last period in the year. E.g. with the default the fourth period of the first
#' year consists of Jan, Feb, Dec.
#'
#' For type "ymonth":
#' Calculate factors to denote the season for ustar-filtering by specifying
#' starting months, with seasons not spanning year boundaries. If Jan is not a
#' starting month, then the first months of each year will be part of the last
#' period in the year e.g. with the default the fourth period of the first year
#' consists of Jan, Feb, Dec.
#'
#' For type "yday":
#' With default parameterization, dates are assumed to denote begin or center of
#' the eddy time period. If working with dates that denote the end of the
#' period, use \code{yday = as.POSIXlt(fGetBeginOfEddyPeriod(dates))$yday}.
#'
#' Internally, the timestamp is shifted 15 minutes after the start of each half
#' hour. When providing the \code{dates} argument, the start time may be shifted
#' by \code{dates = data$timestamp + 15 * 60}.
#'
#' @return For types "month" and "ymonth": an integer vector of length
#'   \code{dates}, with each unique value representing one season. For types
#'   "yday" and "year": an integer vector of length \code{nrow(ds)}, each unique
#'   class representing one season.
#'
#' @export
create_sf <- function(dates, month = as.POSIXlt(dates)$mon + 1L,
                      yday = as.POSIXlt(dates)$yday + 1L,
                      year = as.POSIXlt(dates)$year + 1900L,
                      start_month = c(3, 6, 9, 12),
                      start_yday = c(335, 60, 152, 244),
                      type = c("month", "ymonth", "yday", "year")) {
  # Check function inputs
  type <- match.arg(type)
  if (type %in% c("month", "ymonth")) {
    if (length(year) == 1L) year <- rep(year, length(month))
    if (length(month) != length(year) ) {
      stop("'month' and 'year' arguments must have the same length.",
           call. = FALSE)
    }
    if (any(month < 1 | month > 12) ) {
      stop("'month' out of range 1-12.", call. = FALSE)
    }
    if (type == "month") {
      starts <- data.frame(month = sort(unique(start_month)),
                           year = rep(sort(unique(year)),
                                      each = length(start_month)))
      if (starts$month[1] != 1L) {
        starts <- rbind(data.frame(month = 1L, year = starts$year[1]), starts)
      }
      sf <- integer(length(month)) # 0L
      start_ymonths <- starts$year * 1000L + starts$month
      starts$start_ymonths <- start_ymonths
      ymonths <- year * 1000L + month
      for (i in 1:(length(start_ymonths) - 1)) {
        bo <- (ymonths >= start_ymonths[i]) & (ymonths < start_ymonths[i + 1])
        sf[bo] <- starts$year[i] * 1000L + starts$month[i]
      }
      # Last period with no end border defined
      i <- length(start_ymonths)
      bo <- (ymonths >= start_ymonths[i])
      sf[bo] <- starts$year[i] * 1000L + starts$month[i]
      return(as.factor(sf))
    } else if (type == "ymonth") {
      start_month <- sort(unique(start_month))
      boLastPeriod <- month < start_month[1]
      # translate month before the first specified beginning month to be
      # after last specified month (1 becomes 13)
      month[boLastPeriod] <- month[boLastPeriod] + 12
      start_month_add <- c(start_month, start_month[1] + 12)
      sf <- year * 1000L + rep(start_month[1], length(month) )
      for (i in 2:length(start_month) ) {
        bo <- (month >= start_month_add[i]) & (month < start_month_add[i + 1])
        sf[bo] <- year[bo] * 1000L + start_month[i]
      }
      table(sf)
      as.factor(sf)
    }
  } else if (type %in% c("yday", "year")) {
    if (length(year) == 1L) year <- rep(year, length(yday))
    if (length(yday) != length(year)) {
      stop("'month' and 'year' arguments must have the same length.",
           call. = FALSE)
    }
    if (any(yday < 1 | yday > 366)) {
      stop("'yday' out of range 1..366.", call. = FALSE)
    }
    if (type == "yday") {
      starts <- data.frame(yday = sort(unique(start_yday)),
                           year = rep(sort(unique(year)),
                                      each = length(start_yday)))
    } else if (type == "year") {
      starts <- data.frame(yday = c(335, 60, 152, 244), year)
    }
    colnames(starts) <- c("yday", "year")
    if (starts$yday[1] != 1L) {
      starts <- rbind(data.frame(yday = 1L, year = starts$year[1]), starts)
    }
    sf <- integer(length(yday)) # 0L
    start_ydays <- starts$year * 1000L + starts$yday
    starts$start_ydays <- start_ydays
    ydays <- year * 1000L + yday
    for (i in 1:(length(start_ydays) - 1)) {
      bo <- (ydays >= start_ydays[i]) & (ydays < start_ydays[i + 1])
      sf[bo] <- starts$year[i] * 1000L + starts$yday[i]
    }
    # Last period with no defined end border
    i <- length(start_ydays)
    bo <- (ydays >= start_ydays[i])
    sf[bo] <- starts$year[i] * 1000L + starts$yday[i]
    return(as.factor(sf))
  }
}

#' =============================================================================
#' (Internal) Bin NEE for a Number of Ustar Classes
#' =============================================================================
#'
#' Original name: .binUstar
#' @author TW
#'
#' @param NEE Vector with values of Net Ecosystem Exchange.
#' @param ustar Vector with ustar (friction velocity; m2 / s).
#' @param ustar_classes Number of binning classes.
#' @param sorted Set to TRUE if NEE and ustar are already sorted by
#'   increasing ustar values (performance gain).
#'
#' @return
#' List with entries:
#' * bin_avgs Data frame with columns ustar_avg, NEE_avg, recs with one
#'   row for each bin.
#' * ustar_bins Integer vector reporting the bin for each record in Ustar.v
#'
bin_ustar <- function(NEE, ustar,
                      ustar_classes = subset_ustar()$ustar_classes,
                      sorted = FALSE) {
  ds0.F <- data.frame(NEE = NEE, ustar = ustar)
	ds.F <- ds0.F
	# Within data frame sort values by ustar
	if (!isTRUE(sorted)) {
		order_ustar <- order(ustar)
		ds.F <- ds.F[order_ustar, ]
	} else{
	  order_ustar <- TRUE
	}
	# When assigning ustar classes, only start new class when ustar value changes
	ds.F$ustar_class <- bin_unique(ds.F$ustar, nBin = ustar_classes,
	                                         tol = 1e-14)
	bin_avgs <-
	  ds.F %>%
	  group_by(!!sym("ustar_class")) %>%
	  summarise(Ust_avg = mean(!!sym("Ustar"), na.rm = TRUE),
	            NEE_avg = mean(!!sym("NEE"), na.rm = TRUE),
	            nRec = length(!!sym("NEE"))) %>%
		select(-1)  # omit first column
	bins_unsorted <- integer(nrow(ds.F))
	bins_unsorted[order_ustar] <- as.integer(ds.F$ustar_class)
	return(list(bin_avgs = bin_avgs, ustar_bins = bins_unsorted))
}

#' =============================================================================
#' Create Binning Factor and Shorten Following Bins
#' =============================================================================
#'
#' Original name: .binWithEqualValuesBalanced
#' @author
#'
#' @param x Sorted numeric vector to sort into bins.
#' @param bins Intended number of bins.
#' @param tol Distance between successive values of x that are treated as equal.
#' @param floor Set to FALSE to postpone rounding on start and end values.
#'
#' @details
#' Equal values of x end up in the same bin, and shortens the following bins. By
#' not taking the floor, a better distribution of samples across bins is
#' achieved.
#'
#' @return Integer vector of same length as x, with unique value for each bin.
#'
bin_balanced <- function(x, bins, tol = 1e-8, floor = TRUE) {
	if (bins == 1L) return(integer(length(x)))
	binSize <- length(x) / bins
	if (floor) binSize <- floor(binSize)
	breaksX <- which(diff(x) > tol) + 1
	binId <- rep(1L, length(x))
	iBreak <- 1L  # index from which to seek next break
	for (iClass in 2L:as.integer(bins)) {
		start0 <- round((iClass - 1) * binSize) + 1
		iBreak <- .whichValueGreaterEqual(breaksX, start0, iStart = iBreak)
		start1 <- breaksX[iBreak]
		# Find next uStar change at or after position start0
		binId[start1:length(x)] <- iClass
	}
	return(binId)
}

#' =============================================================================
#' Create Binning Factor and Shift Following Bins
#' =============================================================================
#'
#' Original name: .binWithEqualValuesMinRec
#' @author TW
#'
#' @param x Sorted numeric vector to sort into bins
#' @param bins Intended number of bins
#' @param tol Distance between successive values of x that are treated as equal
#'
#' @section Value
#' Integer vector of same length as x, with unique value for each bin. Each bin
#' holds at least floor(length(x) / bins) records, or more if there were values
#' after the bin that were numerically equal to last value of the bin. The
#' actual number of bins might be differnt from argument bins due to numericall
#' equal values and is reported with attribute \code{bins}. Because of using
#' floor in bin-width calculation, the last, i.e. nbin, of the bins may hold
#' more values.
#'
bin_unique <- function(x, bins, tol = 1e-8) {
	length <- length(x)
	binId <- integer(length)
	binSize <- as.integer(floor(length / bins))
	# Positions in x where value is numerically different from following element
	iBreaksX <- which(diff(x) > tol)
	iBreak <- 0L  # start index in iBreaks, to avoid searching the samller el.
	end <- 0L  # index in x, end of the (previous) period
	bin <- 0L  # bin Id
	while (end < length) {
		bin <- bin + 1L
		stat <- end + 1L
		# Same as start + binsSize-1, with counting from 1 instead of 0
		end <- end + binSize
		# Find the next break after end
		iBreak <- .whichValueGreaterEqual(iBreaksX, end, iBreak + 1L)
		if (is.na(iBreak)) {
			# No break found, set period end to vector end and finish. If last bin
		  # length is smaller than 90% of intended binsize, sort records to former
		  # bin.
			if ((length + 1L - start) < binSize * 0.9 && bin != 1L) {
			  bin <- bin - 1L
			}
			binId[start:length] <- bin
			break
		} else {
			end <- iBreaksX[iBreak]  # update end to position with break after it
			binId[start:end] <- bin
		}
	}
	# Do not use more than bins classes; increase the size of the last class
	binId[binId > bins] <- bins
	attr(binId, "bins") <- min(bin, bins)
	return(binId)
}

#' =============================================================================
#' Search for First Element in an Integer Vector Larger than Threshold
#' =============================================================================
#'
#' Original name: .whichValueGreaterEqual
#' @author TW
#'
#' @description For performance reasons call a C++ function that loops across
#'   the vector.
#'
#' @param x Increasingly sorted numeric vector to search.
#' @param thr Integer scalar: searched element will need to be greater
#'   than or equal to this argument
#' @param iStart Index in vector to start search
#'
#' @details Searches a sorted integer vector for the next element that is >= a
#'   threshold in fast C-code.
#'
#' @return A scalar integer: first index in x, that is >= start, and whose
#'   value x[i] is >= threshold. If no index was found, returns NA.
#'
.whichValueGreaterEqual <- function(x, thr, start = 1L) {
  ans <- whichValueGreaterEqualC(
	  as.integer(x), as.integer(thr), as.integer(start) )
  return(ans)
}

## =============================================================================
#'
#'
#'
#' @param x an integer vector
#' @param threshold an integer vector
#' @param iStart an integer vector
#'
#' @export
whichValueGreaterEqualC <- function(x, threshold, iStart) {  # translated
  threshold <- as.integer(threshold)  # translated
  i <- as.integer(iStart)  # translated
  while ((i < length(x)) & (x[i] < threshold)) i + 1  # translated
  if (i < length(x)) {  # translated
    # return index
    return(i)
  } else {
    # no break found
    return(NA)
  }
}

#' =============================================================================
#' Estimate Ustar Threshold for Single Subset (Using FW1 Algorithm)
#' =============================================================================
#'
#' Original name: usEstUstarThresholdSingleFw1Binned
#' @author TW, OM
#'
#' @param ustar_bins Data frame with columns NEE_avg and ustar_avg, of ustar
#'   bins.
#' @param ctrl_est Parameter list, see \code{\link{control_ustar}} for
#'   defaults and description.
#'
#' @return
#'
#' @export
est_ustar_fw1 <-
  function(ustar_bins, ctrl_est = control_ustar()) {
  # Relying on binned NEE and ustar algorithm to check when plateau is reached
	flag <- FALSE
	# For every u * bin compare to avg of subsequent UST_PLATEAU, until found
	u <- 1
  while (!flag) {
    # Only stop if threshold is found
		if (!flag & (ustar_bins$NEE_avg[u] >=
		          (ctrl_est$plateau_crit * mean(ustar_bins$NEE_avg[
		            (u + 1):(u + ctrl_est$plateau_fwd)], na.rm = TRUE)))) {
			# NEE_i >= .95 * avg(i, i + 1, ..., i + 10)  [FW]
			thr <- ustar_bins$ustar_avg[u]
			flag <- TRUE  # set flag for threshold found in this mode
		}
		# Case where no threshold is found by plateau method, use maximum ustar in
    # that T_class.
		if (u == (nrow(ustar_bins) - 1)) {
			thr <- ustar_bins$ustar_avg[u + 1]
			break
		}
		u <- u + 1  # increase index by 1
	}
	return(thr)
}

#' =============================================================================
#' Estimate Ustar Threshold for Single Subset (Using FW2 Algorithm)
#' =============================================================================
#'
#' Original name: usEstUstarThresholdSingleFw2Binned
#' @author TW, OM
#'
#' @param ustar_bins Data frame with columns NEE_avg and ustar_avg, of ustar
#'   bins.
#' @param ctrl_est Parameter list, see \code{\link{control_ustar}} for
#'   defaults and description.
#'
#' @return
#'
#' @export
est_ustar_fw2 <-
  function(ustar_bins, ctrl_est = control_ustar()) {
	# Algorithm to check when plateau is reached
  flag <- FALSE
  # For every u * bin compare to avg of subsequent UST_PLATEAU, until found
  u <- 1
  thr <- NA
  ## Demand that threshold is higher than \code{ctrl_est$minNuStarPlateau}
  ## records. If fewer records
  # FF2 neads at least two bins after threshold
  umax <- nrow(ustar_bins) - max(2L, ctrl_est$minNuStarPlateau)
  while (u <= umax) {
    if (
		(ustar_bins$NEE_avg[u] >= (ctrl_est$plateau_crit *
		    mean(ustar_bins$NEE_avg[(u + 1):(u + ctrl_est$plateau_fwd)],
		         na.rm = TRUE))) &
		(ustar_bins$NEE_avg[u + 1] >= (ctrl_est$plateau_crit *
		    mean(ustar_bins$NEE_avg[(u + 1 + 1):(u + ctrl_est$plateau_fwd + 1)],
		         na.rm = TRUE)))
	) {
    thr <- ustar_bins$ustar_avg[u]
	  break
  }
	u <- u + 1L
  }
  # Case where no threshold is found by plateau method, use maximum ustar in
  # that T_class.
  if (is.na(thr) & !isTRUE(ctrl_est$omit_bins) )
	  thr <- ustar_bins$ustar_avg[u + 1]
  return(thr)
}

#' =============================================================================
#' Extract Mapping Season to UStar Columns from Distribution Result (Annual)
#' =============================================================================
#'
#' Original name: usGetAnnualSeasonUStarMap
#' @author TW
#'
#' @description
#'
#' @param ustar_thr Result of \code{\link{est_ustar_dist}} or
#'   \code{\link{est_ustar_thr}}$ustar_thr.
#'
#' @details
#'
#' @return A data frame with first column the season, and other columns
#'   different ustar threshold estimates.
#'
#' @export
get_annual_map <- function(ustar_thr) {
  year <- ustar_thr[ustar_thr$agg_mod == "year", , drop = FALSE]
  year$season <- NULL
  year$agg_mod <- NULL
  # deprecated: done in est_ustar_thr
  seasons <- ustar_thr[
    ustar_thr$agg_mod == "season",
    c("season", "seasonYear"),
    drop = FALSE
  ]
  res <- merge(seasons, year)
  res$seasonYear <- NULL
  # Transform column names of "x%" to "Ux" with leading zeros
  colnames(res)[-(1:2)] <- (
    gsub(" ", "0", sprintf("U%2s", gsub("%", "", colnames(res)[-(1:2)])))
  )
  return(res)
}

#' =============================================================================
#' Extract Mapping Season to UStar Columns from Distribution Result (Seasonal)
#' =============================================================================
#'
#' Original name: usGetSeasonalSeasonUStarMap
#' @author TW
#'
#' @param ustar_thr Result of \code{\link{est_ustar_dist}} or
#'   \code{\link{est_ustar_thr}}$ustar_thr.
#'
#' @details
#' From result of \code{\link{est_ustar_dist}} omit aggregation model and
#' seasonYear column.
#'
#' @return A data frame with first column the season, and other columns
#'   different ustar threshold estimates. Transform column names of "x%" to "Ux"
#'   with leading zeros.
#'
#' @export
get_seasonal_map <- function(ustar_thr) {
	seasons <- ustar_thr[ustar_thr$agg_mod == "season", , drop = FALSE]
	seasons$seasonYear <- NULL
	seasons$agg_mod <- NULL
	colnames(seasons)[-(1:2)] <- (
	  gsub(" ", "0", sprintf("U%2s", gsub("%", "", colnames(seasons)[-(1:2)])))
	)
	return(seasons)
}

#' =============================================================================
#' Estimate Distribution of Ustar Threshold by Bootstrapping
#' =============================================================================
#'
#' Original name: sEddyProc_sEstimateUstarScenarios
#' @author TW
#'
#' @param ctrl_est Control parameters for estimating ustar on a single binned
#'   series, see \code{\link{control_ustar}}.
#' @param ctrl_sub Control parameters for subsetting time series (number of
#'   temperature and ustar classes), see \code{\link{subset_ustar}}.
#' @param ustar Column name for ustar.
#' @param NEE Column name for NEE.
#' @param Tair Column name for air temperature.
#' @param Rg Column name for solar radiation.
#' @param ... Further arguments to \code{\link{est_ustar_thr}}.
#' @param sf Factor of seasons to split (data is resampled only within the
#'   seasons).
#' @param reps Number of repetitions in the bootstrap.
#' @param probs Quantiles of the bootstrap sample to return. Default is the 5%,
#'   median and 95% of the bootstrap.
#' @param verbose Set to FALSE to omit printing progress.
#'
#' @details
#' The choice of the criterion for sufficiently turbulent conditions (ustar >
#' chosen threshold) introduces large uncertainties in calculations based on
#' gap-filled Eddy data. Hence, it is good practice to compare derived
#' quantities based on gap-filled data using a range of ustar threshold
#' estimates.
#'
#' This method explores the probability density of the threshold by repeating
#' its estimation on a bootstrapped sample. By default it returns the 90%
#' confidence interval (argument \code{probs}). For larger intervals the sample
#' number must be increased (argument \code{probs}).
#'
#' \describe{
#' \item{Quality Assurance}{
#' If more than \code{ctrl_est$min_boot} (default 40%) did not
#' report a threshold, no quantiles (i.e. NA) are reported.
#' }}
#'
#' @return A data frame with columns \code{agg_mod}, \code{year}, and
#'   \code{ustar} estimate based on the non-resampled data. The other columns
#'   correspond to the quantiles of ustar estimate for given probabilities
#'   (argument \code{probs}) based on the distribution of estimates using
#'   resampled the data.
#'
#' @export
est_ustar_scens <- function(temp, ctrl_est = control_ustar(),
                            ctrl_sub = subset_ustar(), ustar = "ustar",
                            NEE = "NEE", Tair = "Tair", Rg = "Rg", ...,
                            sf = create_sf(data$timestamp, type = "month"),
                            reps = 200L, probs = c(0.05, 0.5, 0.95),
                            verbose = TRUE) {
  temp$season <- sf
  ds <- data[, c("timestamp", ustar, NEE, Tair, Rg)]
  colnames(ds) <- c("timestamp", "ustar", "NEE", "Tair", "Rg")
  ds$sf <- temp$season
  res0 <- suppressMessages(est_ustar_thr(
    ustar = ustar,
    NEE = NEE,
    Tair = Tair,
    Rg = Rg,
    ...,
    ctrl_est = ctrl_est, ctrl_sub = ctrl_sub,
    sf = NULL
  ))
  pos_agg <- which(res0$agg_mod == "single")
  pos_years <- which(res0$agg_mod == "year")
  pos_seasons <- which(res0$agg_mod == "season")
  years0 <- res0$seasonYear[pos_years]
  seasons0 <- res0$season[pos_seasons]
  wrapper <- function(sample, ...) {
    ds2 <- ds
    season_boot <-
      ds2 %>%
      split(.$sf) %>%
      map_df(function(dss) {
        sample <- sample.int(nrow(dss), replace = TRUE)
        dss[sample, , drop = FALSE]
      })
    if (isTRUE(verbose)) message(".", appendLF = FALSE)
    res <- est_ustar_thr(
      season_boot, ...,
      sf = season_boot$season,
      ctrl_est = ctrl_est, ctrl_sub = ctrl_sub
    )
    gc()
    # Check if years and seasons have been calculated differently due to
    # subsetting with too few values within a season, report NA for those cases
    res_agg <- res$ustar_thr$ustar[pos_agg]
    years <- res$ustar_thr$year[pos_years]
    res_years <- structure(
      if (all(years == years0)) {
        res$ustar_thr$ustar[pos_years]
      } else {
        rep(NA, length(years0))
      },
      names = as.character(years0)
    )
    res_seasons <- structure(
      if (
        nrow(res$ustar_thr) == nrow(res0) &&
        all((seasons <- res$ustar_thr$season[pos_seasons]) == seasons0)
      ) {
        res$ustar_thr$ustar[pos_seasons]
      } else {
        rep(NA, length(seasons0))
      },
      names = as.character(seasons0)
    )
    return(c(agg_years = res_agg, res_years, res_seasons))
  }
  # Collect into one big matrix
  ustar0 <- res0$ustar[c(pos_agg, pos_years, pos_seasons)]
  Ustar.l <- suppressMessages(Ustar.l <- lapply(1:(reps - 1), wrapper, ...))
  if (isTRUE(verbose)) message("") # line break
  stat <- do.call(rbind, c(list(ustar0), Ustar.l))
  # workaround: if probs is a scalar, apply returns vector without names in
  # order to get the matrix in all cases, prepend extend probs and delete the
  # corresponding row afterwards
  res_quants0 <- apply(stat, 2, quantile, probs = c(0, probs), na.rm = TRUE)
  res_quants <- t(res_quants0[-1, , drop = FALSE])
  invalid <- colSums(is.finite(stat)) / nrow(stat) < ctrl_est$min_boot
  res_quants[invalid, ] <- NA
  rownames(res_quants) <- NULL
  res_df <- cbind(res0, res_quants)
  message(paste(
    "Estimated ustar distribution of:\n",
    paste(
      capture.output(res_df[res_df$agg_mod == "single", -(1:3)]),
      collapse = "\n"
    ), "\nby using ", reps, "bootstrap samples and controls:\n",
    paste(capture.output(unlist(ctrl_sub)), collapse = "\n")
  ))
  # .self$sSetUstarScenarios(get_annual_map(res_df))
  return(res_df)
}
grahamstewart12/tidyflux documentation built on June 4, 2019, 7:44 a.m.