# Copyright 2019 Province of British Columbia
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
#' @title Perform a custom volume frequency analysis
#'
#' @description Performs a volume frequency analysis on custom data. Defaults to ranking by minimums; use \code{use_max} for to
#' rank by maximum flows. Calculates the statistics from events and flow values provided. Columns of events (e.g. years), their
#' values (minimums or maximums), and identifiers (low-flows, high-flows, etc.). Function will calculate using all values in the
#' provided data (no grouped analysis). Analysis methodology replicates that from
#' \href{https://www.hec.usace.army.mil/software/hec-ssp/}{HEC-SSP}. Returns a list of tibbles and plots.
#'
#' @param data A data frame of data that contains columns of events, flow values, and measures (data type).
#' @param events Column in \code{data} that contains event identifiers, typically year values. Default \code{'Year'}.
#' @param values Column in \code{data} that contains numeric flow values, in units of cubic metres per second. Default \code{'Value'}.
#' @param measures Column in \code{data} that contains measure identifiers (example data: '7-day low' or 'Annual Max'). Can have
#' multiple measures (ex. '7-day low' and '30-day low') in column if multiple statistics are desired. Default \code{'Measure'}.
#' @param use_max Logical value to indicate using maximums rather than the minimums for analysis. Default \code{FALSE}.
#' @param use_log Logical value to indicate log-scale transforming of flow data before analysis. Default \code{FALSE}.
#' @param prob_plot_position Character string indicating the plotting positions used in the frequency plots, one of \code{'weibull'},
#' \code{'median'}, or \code{'hazen'}. Points are plotted against (i-a)/(n+1-a-b) where \code{i} is the rank of the value; \code{n} is the
#' sample size and \code{a} and \code{b} are defined as: (a=0, b=0) for Weibull plotting positions; (a=.2; b=.3) for Median
#' plotting positions; and (a=.5; b=.5) for Hazen plotting positions. Default \code{'weibull'}.
#' @param prob_scale_points Numeric vector of probabilities to be plotted along the X axis in the frequency plot. Inverse of
#' return period. Default \code{c(.9999, .999, .99, .9, .5, .2, .1, .02, .01, .001, .0001)}.
#' @param compute_fitting Logical value to indicate whether to fit plotting positions to a distribution. If 'FALSE' the output will
#' return only the data, plotting positions, and plot. Default \code{TRUE}.
#' @param fit_distr Character string identifying the distribution to fit annual data, one of \code{'PIII'} (Log Pearson Type III)
#' or \code{'weibull'} (Weibull) distributions. Default \code{'PIII'}.
#' @param fit_distr_method Character string identifying the method used to fit the distribution, one of \code{'MOM'} (method of
#' moments) or \code{'MLE'} (maximum likelihood estimation). Selected as \code{'MOM'} if \code{fit_distr ='PIII'} (default) or
#' \code{'MLE'} if \code{fit_distr = 'weibull'}.
#' @param fit_quantiles Numeric vector of quantiles to be estimated from the fitted distribution.
#' Default \code{c(.975, .99, .98, .95, .90, .80, .50, .20, .10, .05, .01)}.
#' @param plot_curve Logical value to indicate plotting the computed curve on the probability plot. Default \code{TRUE}.
#' @param plot_axis_title Character string of the plot y-axis title. Default \code{'Discharge (cms)'}.
#'
#' @return A list with the following elements:
#' \item{Freq_Analysis_Data}{Data frame with provided data for analysis.}
#' \item{Freq_Plot_Data}{Data frame with plotting positions used in frequency plot.}
#' \item{Freq_Plot}{ggplot2 object with plotting positions and (optional) fitted curve.}
#' \item{Freq_Fitting}{List of fitted objects from fitdistrplus.}
#' \item{Freq_Fitted_Quantiles}{Data frame with fitted quantiles.}
#'
#'
#' @examples
#' \dontrun{
#'
#' # Working example:
#'
#' # Calculate some values to use for a frequency analysis
#' # (requires years, values for those years, and the name of the measure/metric)
#' low_flows <- calc_annual_lowflows(station_number = "08NM116",
#' start_year = 1980,
#' end_year = 2000,
#' roll_days = 7)
#' low_flows <- dplyr::select(low_flows, Year, Value = Min_7_Day)
#' low_flows <- dplyr::mutate(low_flows, Measure = "7-Day")
#'
#' # Compute the frequency analysis using the default parameters
#' results <- compute_frequency_analysis(data = low_flows,
#' events = Year,
#' values = Value,
#' measure = Measure)
#'
#' }
#' @export
compute_frequency_analysis <- function(data,
events = Year,
values = Value,
measures = Measure,
use_max = FALSE,
use_log = FALSE,
prob_plot_position = c("weibull", "median", "hazen"),
prob_scale_points = c(.9999, .999, .99, .9, .5, .2, .1, .02, .01, .001, .0001),
compute_fitting = TRUE,
fit_distr = c("PIII", "weibull"),
fit_distr_method = ifelse(fit_distr == "PIII", "MOM", "MLE"),
fit_quantiles = c(.975, .99, .98, .95, .90, .80, .50, .20, .10, .05, .01),
plot_curve = TRUE,
plot_axis_title = "Discharge (cms)"){
# replicate the frequency analysis of the HEC-SSP program
# refer to Chapter 7 of the user manual
## ARGUMENT CHECKS
## ---------------
# plotting
if (!is.logical(use_log))
stop("use_log must be logical (TRUE/FALSE).", call. = FALSE)
if (!is.logical(use_max))
stop("use_max must be logical (TRUE/FALSE).", call. = FALSE)
if (!all(prob_plot_position %in% c("weibull","median","hazen")))
stop("prob_plot_position must be one of weibull, median, or hazen.", call. = FALSE)
if (!is.numeric(prob_scale_points))
stop("prob_scale_points must be numeric and between 0 and 1 (not inclusive).", call. = FALSE)
if (!all(prob_scale_points > 0 & prob_scale_points < 1))
stop("prob_scale_points must be numeric and between 0 and 1 (not inclusive).", call. = FALSE)
# fitting
logical_arg_check(compute_fitting)
if (compute_fitting) {
if (!all(fit_distr %in% c("weibull", "PIII")))
stop("fit_distr must be one of weibull or PIII.", call. = FALSE)
if (!is.numeric(fit_quantiles))
stop("fit_quantiles must be numeric and between 0 and 1 (not inclusive).", call. = FALSE)
if (!all(fit_quantiles > 0 & fit_quantiles < 1))
stop("fit_quantiles must be numeric and between 0 and 1 (not inclusive).", call. = FALSE)
if (fit_distr[1] == 'weibull' & use_log)
stop("Cannot fit Weibull distribution on log-scale.", call. = FALSE)
if (fit_distr[1] != "PIII" & fit_distr_method[1] == "MOM")
stop('MOM only can be used with PIII distribution.', call. = FALSE)
if (!is.logical(plot_curve))
stop("plot_curve must be logical (TRUE or FALSE).")
}
# Check if data is provided
if (missing(data))
stop("A data frame of annual data must be provided using the 'data' argument.", call. = FALSE)
if (!is.data.frame(data))
stop("data argument is not a data frame.", call. = FALSE)
data <- as.data.frame(data)
# Check the values and events columns
if (!as.character(substitute(events)) %in% names(data))
stop("Events column not found in data frame. Rename events column to 'Years' or identify the column using 'events' argument.", call. = FALSE)
if (!as.character(substitute(values)) %in% names(data))
stop("Values not found in data frame. Rename values column to 'Value' or identify the column using 'values' argument.", call. = FALSE)
names(data)[names(data) == as.character(substitute(values))] <- "Value"
if (!is.numeric(data$Value))
stop("Values in values column must be numeric.", call. = FALSE)
if (!as.character(substitute(measures)) %in% names(data))
stop("Measures not found in data frame. Rename measure column to 'Measure' or identify the column using 'measures' argument.", call. = FALSE)
names(data)[names(data) == as.character(substitute(measures))] <- "Measure"
# Set the Q_stat dataframe
Q_stat <- data
# fitting
if (compute_fitting) {
if(fit_distr[1] == 'weibull' & any(Q_stat$Value < 0, na.rm = TRUE))
stop("Cannot fit weibull distribution with negative flow values.", call. = FALSE)
if(fit_distr[1] == 'PIII' & any(Q_stat$Value <= 0, na.rm = TRUE))
stop("Cannot fit 'PIII' distribution with negative or zero flow values.", call. = FALSE)
## Define functions for analysis
##------------------------------
# This code chunk removes error for "no visible binding for '<<-' assignment to 'dPIII'" etc
dPIII <- NULL
pPIII <- NULL
qPIII <- NULL
mPIII <- NULL
# Define the log=Pearson III function needed for fitting at the GLOBAL environment level
dPIII <<- function(x, shape, location, scale) PearsonDS::dpearsonIII(x, shape, location, scale, log = FALSE)
pPIII <<- function(q, shape, location, scale) PearsonDS::ppearsonIII(q, shape, location, scale, lower.tail = TRUE, log.p = FALSE)
qPIII <<- function(p, shape, location, scale) PearsonDS::qpearsonIII(p, shape, location, scale, lower.tail = TRUE, log.p = FALSE)
mPIII <<- function(order, shape, location, scale){
# compute the empirical first 3 moments of the PIII distribution
if(order == 1) return(location + shape * scale)
if(order == 2) return(scale * scale * shape)
if(order == 3) return(2 / sqrt(shape) * sign(scale))
}
}
#--------------------------------------------------------------
# Plot the data on the distrubtion
# Compute the summary table for output
Q_stat_output <- tidyr::spread(Q_stat, Measure, Value)
# See if a (natural) log-transform is to be used in the frequency analysis?
# This flag also controls how the data is shown in the frequency plot
if(use_log)Q_stat$Value <- log(Q_stat$Value)
# make the plot. Remove any missing or infinite or NaN Values
Q_stat <- Q_stat[is.finite(Q_stat$Value),] # remove missing/ Inf/ NaN Values
# get the plotting positions
# From the HEC-SSP package, the plotting positions are (m-a)/(n+1-a-b)
a <- 0; b <- 0
if(prob_plot_position[1] == 'weibull'){a <- 0.0; b <- 0.0}
if(prob_plot_position[1] == 'median' ){a <- 0.3; b <- 0.3}
if(prob_plot_position[1] == 'hazen' ){a <- 0.5; b <- 0.5}
plotdata <- plyr::ddply(Q_stat, "Measure", function(x, a, b, use_max){
# sort the data
x <- x[ order(x$Value),]
x$prob <- ((seq_len(length(x$Value))) - a)/((length(x$Value) + 1 - a - b))
if(use_max)x$prob <- 1 - x$prob # they like to use p(exceedance) if using a minimum
#x$dist.prob <- stats::qnorm(1 - x$prob) temporarilty remove
x$Return_P <- 1/x$prob
x
}, a = a, b = b, use_max = use_max)
# change the measure labels in the plot
plotdata2 <- plotdata
# Setting dates and Values to actual Values. Some sort of environment() error in plotting due to function environment with dates and Values
# Error in as.list.environment(x, all.names = TRUE) :
# object 'Value' not found
events <- "Year"
values <- "Value"
measures <- "Measure"
plotdata2$Measure <- factor(plotdata2$Measure, levels = unique(plotdata2$Measure))
freqplot <- ggplot2::ggplot(data = plotdata2,
ggplot2::aes(x = prob, y = Value, group = Measure,
#fill = Measure,
color = Measure),
environment = environment())+
ggplot2::geom_point(size = 2)+
#ggplot2::geom_point(size =2, shape = 21, colour="black")+
ggplot2::xlab("Probability")+
ggplot2::scale_x_continuous(trans = scales::probability_trans("norm", lower.tail = FALSE),
breaks = prob_scale_points,
sec.axis = ggplot2::dup_axis(
name = 'Return Period',
labels = function(x){ifelse(1/x < 2, round(1/x,2), round(1/x,0))}))+
#ggplot2::scale_color_brewer(palette = "Set1") +
ggplot2::scale_color_viridis_d(end = ifelse(length(unique(plotdata2$Measure)) <= 2, 0.6,
ifelse(length(unique(plotdata2$Measure)) <= 3, 0.75, 0.94))) +
#ggplot2::scale_fill_viridis_d() +
ggplot2::theme_bw() +
ggplot2::labs(color = paste0('Events')) +
#ggplot2::guides(fill = "none")+
ggplot2::theme(panel.border = ggplot2::element_rect(colour = "black", fill = NA, size = 1),
panel.grid = ggplot2::element_line(size = .2),
axis.title = ggplot2::element_text(size = 12),
axis.text = ggplot2::element_text(size = 10),
axis.title.x.top = ggplot2::element_text(size = 12),
#legend.position = "right",
#legend.spacing = ggplot2::unit(0, "cm"),
#legend.justification = "right",
legend.text = ggplot2::element_text(size = 10),
legend.title = ggplot2::element_text(size = 10))
legend.title.align <- 1
if(!use_max){ freqplot <- freqplot + ggplot2::theme(legend.justification = c(1, 1), legend.position = c(.98, .98))}
if(use_max){ freqplot <- freqplot + ggplot2::theme(legend.justification = c(1,0), legend.position = c(.98, 0.02))}
if(!use_log){ freqplot <- freqplot + ggplot2::scale_y_log10(breaks = scales::pretty_breaks(n = 10),
labels = scales::label_number(scale_cut = append(scales::cut_short_scale(),1,1)))}
if(use_log){ freqplot <- freqplot + ggplot2::scale_y_continuous(breaks = scales::pretty_breaks(n = 10),
labels = scales::label_number(scale_cut = append(scales::cut_short_scale(),1,1)))}
# if(use_log & use_max ){freqplot <- freqplot + ggplot2::ylab(expression(lnDischarge~(m^3/s)))} # adjust the Y axis label
# if(use_log & !use_max){freqplot <- freqplot + ggplot2::ylab(expression(lnDischarge~(m^3/s)))}
# if(!use_log & use_max ){freqplot <- freqplot + ggplot2::ylab(expression(Discharge~(m^3/s)))}
# if(!use_log & !use_max){freqplot <- freqplot + ggplot2::ylab(expression(Discharge~(m^3/s)))}
if(use_log & use_max ){freqplot <- freqplot + ggplot2::ylab(paste0("ln", plot_axis_title))} # adjust the Y axis label
if(use_log & !use_max){freqplot <- freqplot + ggplot2::ylab(paste0("ln", plot_axis_title))}
if(!use_log & use_max ){freqplot <- freqplot + ggplot2::ylab(paste0(plot_axis_title))}
if(!use_log & !use_max){freqplot <- freqplot + ggplot2::ylab(paste0(plot_axis_title))}
if (compute_fitting) {
#--------------------------------------------------------------
# fit the distribution to each measure
# log-Pearson III implies that the log(x) has a 3-parameter gamma distribution
ePIII <- function(x, order){
# compute (centered) empirical centered moments of the data
if(order == 1) return(mean(x))
if(order == 2) return(stats::var(x))
if(order == 3) return(e1071::skewness(x, type = 2))
}
fit <- plyr::dlply(Q_stat, "Measure", function(x, distr, fit_method){
start <- NULL
# PIII is fit to log-of Values unless use_log has been set, in which case data has previous been logged
if(distr == 'PIII' & !use_log){x$Value <- log10(x$Value)}
# get starting Values
if(distr == 'PIII'){
# Note that the above forgot to mulitply the scale by the sign of skewness .
# Refer to Page 24 of the Bulletin 17c
m <- mean(x$Value)
v <- stats::var(x$Value)
s <- stats::sd(x$Value)
g <- e1071::skewness(x$Value, type = 2)
# This can be corrected, but HEC Bulletin 17b does not do these corrections
# Correct the sample skew for bias using the recommendation of
# Bobee, B. and R. Robitaille (1977). "The use of the Pearson Type 3 and Log Pearson Type 3 distributions revisited."
# Water Resources Reseach 13(2): 427-443, as used by Kite
#n <- length(x$Value)
#g <- g*(sqrt(n*(n-1))/(n-2))*(1+8.5/n)
# We will use method of moment estimates as starting Values for the MLE search
my_shape <- (2 / g) ^ 2
my_scale <- sqrt(v) / sqrt(my_shape) * sign(g)
my_location <- m - my_scale * my_shape
start <- list(shape = my_shape, location = my_location, scale = my_scale)
}
if(fit_method == "MLE") {fit <- fitdistrplus::fitdist(x$Value, distr, start = start, control = list(maxit = 1000)) }# , trace=1, REPORT=1))
if(fit_method == "MOM") {fit <- fitdistrplus::fitdist(x$Value, distr, start = start, calcvcov = FALSE,
method = "mme", order = 1:3, memp = ePIII, control = list(maxit = 1000))
} # fixed at MOM estimates
fit
}, distr = fit_distr[1], fit_method = fit_distr_method[1])
#--------------------------------------------------------------
# extracted the fitted quantiles from the fitted distribution
fitted_quantiles <- plyr::ldply(names(fit), function (measure, prob, fit, use_max, use_log){
# get the quantiles for each model
x <- fit[[measure]]
# if fitting minimums then you want EXCEEDANCE probabilities
if(use_max) prob <- 1 - prob
quant <- stats::quantile(x, prob = prob)
quant <- unlist(quant$quantiles)
if(x$distname == 'PIII' & !use_log)quant <- 10 ^ quant # PIII was fit to the log-Values
if(use_max) prob <- 1 - prob # reset for adding to data frame
if(use_log) quant <- exp(quant) # transforma back to original scale
res <- data.frame(Measure = measure, distr = x$distname, prob = prob, quantile = quant , stringsAsFactors = FALSE)
rownames(res) <- NULL
res
}, prob = fit_quantiles, fit = fit, use_max = use_max, use_log = use_log)
# get the transposed version
fitted_quantiles$Return_P <- 1 / fitted_quantiles$prob
fitted_quantiles$Measure <- factor(fitted_quantiles$Measure, levels = unique(fitted_quantiles$Measure))
fitted_quantiles_output <- tidyr::spread(fitted_quantiles, Measure, quantile)
fitted_quantiles_output <- dplyr::rename(fitted_quantiles_output,
Distribution = distr,
Probability = prob,
"Return Period" = Return_P)
## Add fitted curves to the freqplot
fit_quantiles_plot <- seq(to = 0.99, from = 0.01, by = .005)
fitted_quantiles_plot <- plyr::ldply(names(fit), function (measure, prob, fit, use_max, use_log){
# get the quantiles for each model
x <- fit[[measure]]
# if fitting minimums then you want EXCEEDANCE probabilities
if(use_max) prob <- 1 - prob
quant <- stats::quantile(x, prob = prob)
quant <- unlist(quant$quantiles)
if(x$distname == 'PIII' & !use_log)quant <- 10 ^ quant # PIII was fit to the log-Values
if(use_max) prob <- 1 - prob # reset for adding to data frame
#if(use_log) quant <- exp(quant) # transforma back to original scale #commented out sep10 by JG
res <- data.frame(Measure = measure, distr = x$distname, prob = prob, quantile = quant , stringsAsFactors = FALSE)
rownames(res) <- NULL
res
}, prob = fit_quantiles_plot, fit = fit, use_max = use_max, use_log = use_log)
if (plot_curve) {
fitted_quantiles_plot$Measure <- factor(fitted_quantiles_plot$Measure, levels = unique(plotdata2$Measure))
freqplot <- freqplot +
ggplot2::geom_line(data = fitted_quantiles_plot, ggplot2::aes(x = prob, y = quantile, group = Measure, color = Measure),
size = 1) +
ggplot2::labs(color = paste0('Events and\nComputed Curve'))
}
}
# Other modifications for outputs
row.names(Q_stat) <- seq_len(nrow(Q_stat))
plotdata <- dplyr::rename(plotdata,
Probability = prob,
"Return Period" = Return_P)
if (compute_fitting) {
freq_results <- list(Freq_Analysis_Data = dplyr::as_tibble(Q_stat),
Freq_Plot_Data = dplyr::as_tibble(plotdata), # has the plotting positions for each point in frequency analysis
Freq_Plot = freqplot,
Freq_Fitting = fit, # list of fits of freq.distr to each measure
Freq_Fitted_Quantiles = dplyr::as_tibble(fitted_quantiles_output)#, # fitted quantiles and their transposition
)
} else {
freq_results <- list(Freq_Analysis_Data = dplyr::as_tibble(Q_stat),
Freq_Plot_Data = dplyr::as_tibble(plotdata), # has the plotting positions for each point in frequency analysis
Freq_Plot = freqplot)
}
freq_results
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.