fasstr: Analyze, Summarize, and Visualize Daily Streamflow Data

Documented in compute_annual_trends

# Copyright 2019 Province of British Columbia
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
# http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.



#' @title Calculate prewhitened nonlinear annual trends on streamflow data
#'
#' @description Calculates prewhitened nonlinear trends on annual streamflow data. Uses the
#'    \href{https://CRAN.R-project.org/package=zyp}{\code{zyp}} package to calculate trends. Review \code{zyp} for more information
#'    Calculates statistics from all values, unless specified. Returns a list of tibbles and plots.
#'    All annual statistics calculated using the \code{calc_all_annual_stats()} function which uses the following 
#'    \code{fasstr} functions:
#' \itemize{
#'  \item{\code{calc_annual_stats()}}
#'  \item{\code{calc_annual_lowflows()}}
#'  \item{\code{calc_annual_cumulative_stats()}}
#'  \item{\code{calc_annual_flow_timing()}}
#'  \item{\code{calc_monthly_stats()}}
#'  \item{\code{calc_annual_normal_days()}}
#'  }
#' 
#' @inheritParams calc_all_annual_stats
#' @param months Numeric vector of months to include in analysis. For example, \code{3} for March, \code{6:8} for Jun-Aug or 
#'    \code{c(10:12,1)} for first four months (Oct-Jan) when \code{water_year_start = 10} (Oct). Default summarizes all 
#'    months (\code{1:12}). If not all months, seasonal total yield and volumetric flows will not be included.
#' @param zyp_method Character string identifying the prewhitened trend method to use from \code{zyp}, either \code{'zhang'}
#'     or \code{'yuepilon'}. \code{'zhang'} is recommended over \code{'yuepilon'} for hydrologic applications (Bürger 2017; 
#'     Zhang and Zwiers 2004). Required.
#' @param include_plots Logical value indicating if annual trending plots should be included. Default \code{TRUE}.
#' @param zyp_alpha Numeric value of the significance level (ex. \code{0.05}) of when to plot a trend line. Leave blank for no line.
#' 
#' @return A list of tibbles and optional plots from the trending analysis including:
#'   \item{Annual_Trends_Data}{a tibble of the annual statistics used for trending}
#'   \item{Annual_Trends_Results}{a tibble of the results of the zyp trending analysis}
#'   \item{Annual_*}{each ggplot2 object for each annual trended statistic}
#' 
#' @references References:
#' \itemize{
#'  \item{Büger, G. 2017. On trend detection. Hydrological Processes 31, 4039–4042. https://doi.org/10.1002/hyp.11280.}
#'  \item{Sen, P.K., 1968. Estimates of the Regression Coefficient Based on Kendall's Tau. Journal of the 
#'        American Statistical Association Vol. 63, No. 324: 1379-1389.}
#'  \item{Wang, X.L. and Swail, V.R., 2001. Changes in extreme wave heights in northern hemisphere oceans and 
#'        related atmospheric circulation regimes. Journal of Climate, 14: 2204-2221.}
#'  \item{Yue, S., P. Pilon, B. Phinney and G. Cavadias, 2002. The influence of autocorrelation on the ability
#'        to detect trend in hydrological series. Hydrological Processes, 16: 1807-1829.}
#'  \item{Zhang, X., Vincent, L.A., Hogg, W.D. and Niitsoo, A., 2000. Temperature and Precipitation Trends in
#'        Canada during the 20th Century. Atmosphere-Ocean 38(3): 395-429.}
#'  \item{Zhang, X., Zwiers, F.W., 2004. Comment on “Applicability of prewhitening to eliminate the influence of serial 
#'        correlation on the Mann-Kendall test” by Sheng Yue and Chun Yuan Wang. Water Resources Research 40. 
#'        https://doi.org/10.1029/2003WR002073.}
#'        }
#'      
#' @seealso \code{\link[zyp]{zyp-package}}, 
#'          \code{\link{calc_all_annual_stats}}
#'   
#' @examples
#' \dontrun{
#' 
#' # Working examples:
#' 
#' # Compute trends statistics using a data frame and data argument with defaults
#' flow_data <- tidyhydat::hy_daily_flows(station_number = "08NM116")
#' trends <- compute_annual_trends(data = flow_data,
#'                                 zyp_method = "zhang")
#' 
#' # Compute trends statistics using station_number with defaults
#' trends <- compute_annual_trends(station_number = "08NM116",
#'                                 zyp_method = "zhang")
#'                       
#' # Compute trends statistics and plot a trend line if the significance is less than 0.05
#' trends <- compute_annual_trends(station_number = "08NM116",
#'                                 zyp_method = "zhang",
#'                                 zyp_alpha = 0.05)
#'                       
#' # Compute trends statistics and do not plot the results
#' trends <- compute_annual_trends(station_number = "08NM116",
#'                                 zyp_method = "zhang",
#'                                 include_plots = FALSE)
#' 
#' }                
#' @export



compute_annual_trends <- function(data,
                                  dates = Date,
                                  values = Value,
                                  groups = STATION_NUMBER,
                                  station_number,
                                  zyp_method,
                                  basin_area, 
                                  water_year_start = 1,
                                  start_year,
                                  end_year,
                                  exclude_years,
                                  months = 1:12,
                                  annual_percentiles = c(10,90),
                                  monthly_percentiles = c(10,20),
                                  stats_days = 1,
                                  stats_align = "right",
                                  lowflow_days = c(1,3,7,30),
                                  lowflow_align = "right",
                                  timing_percent = c(25,33,50,75),
                                  normal_percentiles = c(25,75),
                                  complete_years = FALSE,
                                  ignore_missing = FALSE,
                                  allowed_missing_annual = ifelse(ignore_missing,100,0),
                                  allowed_missing_monthly = ifelse(ignore_missing,100,0),
                                  include_plots = TRUE,
                                  zyp_alpha){       
  
  
  
  ## ARGUMENT CHECKS
  ## ---------------
  
  if (missing(data)) {
    data <- NULL
  }
  if (missing(station_number)) {
    station_number <- NULL
  }
  if (missing(start_year)) {
    start_year <- 0
  }
  if (missing(end_year)) {
    end_year <- 9999
  }
  if (missing(exclude_years)) {
    exclude_years <- NULL
  }
  if (missing(basin_area)) {
    basin_area <- NA
  }
  if (missing(zyp_method)) {
    zyp_method <- NA
  }
  if (missing(zyp_alpha)) {
    zyp_alpha <- NA
  }
  
  zyp_method_checks(zyp_method)
  zyp_alpha_checks(zyp_alpha)
  if (!is.logical(include_plots))         
    stop("include_plots must be logical (TRUE/FALSE).", call. = FALSE)
  
  
  ## CHECKS ON FLOW DATA
  ## -------------------
  
  # Check if data is provided and import it
  flow_data <- flowdata_import(data = data, 
                               station_number = station_number)
  
  # Save the original columns (to check for STATION_NUMBER col at end) and ungroup if necessary
  orig_cols <- names(flow_data)
  flow_data <- dplyr::ungroup(flow_data)
  
  # Check and rename columns
  flow_data <- format_all_cols(data = flow_data,
                               dates = as.character(substitute(dates)),
                               values = as.character(substitute(values)),
                               groups = as.character(substitute(groups)),
                               rm_other_cols = TRUE)
  
  trends_data <- calc_all_annual_stats(data = flow_data,
                                       basin_area = basin_area, 
                                       water_year_start = water_year_start,
                                       start_year = start_year,
                                       end_year = end_year,
                                       exclude_years = exclude_years,
                                       months = months,
                                       annual_percentiles = annual_percentiles,
                                       monthly_percentiles = monthly_percentiles,
                                       stats_days = stats_days,
                                       stats_align = stats_align,
                                       lowflow_days = lowflow_days,
                                       lowflow_align = lowflow_align,
                                       timing_percent = timing_percent,
                                       normal_percentiles = normal_percentiles,
                                       transpose = TRUE,
                                       complete_years = complete_years,
                                       ignore_missing = ignore_missing,
                                       allowed_missing_annual = allowed_missing_annual,
                                       allowed_missing_monthly = allowed_missing_monthly)
  
  # Compute some summary stats on the input data
  colnames(trends_data)[2] <- "Statistic"
  trends_data_summary <- tidyr::gather(trends_data, Year, Value, 3:ncol(trends_data))
  trends_data_summary <- trends_data_summary[stats::complete.cases(trends_data_summary$Value), ]
  trends_data_summary <- dplyr::summarise(dplyr::group_by(trends_data_summary, STATION_NUMBER, Statistic),
                                          min_year = as.numeric(min(Year, na.rm = TRUE)),
                                          max_year = as.numeric(max(Year, na.rm = TRUE)),
                                          n_years = sum(!is.na(Value)),
                                          mean = mean(Value, na.rm = TRUE),
                                          median = stats::median(Value, na.rm = TRUE),
                                          min = min(Value, na.rm = TRUE),
                                          max = max(Value, na.rm = TRUE))
  
  # Complete trends analysis
  trends_results <- zyp::zyp.trend.dataframe(indat = trends_data,
                                             metadata.cols = 2,
                                             method = zyp_method)
  
  
  # Merge the summary stats with the results
  trends_results <- merge(trends_results, trends_data_summary, by = c("STATION_NUMBER", "Statistic"), all = TRUE)
  
  # Order the list of stats in order of all_stats
  trends_results <- dplyr::arrange(trends_results, STATION_NUMBER, Statistic)
  
  
  # Create plots if required
  if (include_plots) {
    
    # Create the list to place all plots
    plots_list <- list()
    
    for (stn in unique(trends_results$STATION_NUMBER)) {
      
      trends_results_stn <- dplyr::filter(trends_results, STATION_NUMBER == stn)
      trends_results_stn <- dplyr::select(trends_results_stn, -STATION_NUMBER)
      
      trends_data_stn <- dplyr::filter(trends_data, STATION_NUMBER == stn)
      trends_data_stn <- dplyr::select(trends_data_stn, -STATION_NUMBER)
      
      ## PLOT TRENDS
      ## -----------
      
      # Set data for plotting
      trends_data_stn <- tidyr::gather(trends_data_stn, Year, Value, -1)
      trends_data_stn <- dplyr::filter(trends_data_stn, Year >= min(trends_results_stn$min_year, na.rm = TRUE))
      
      trends_data_stn <- dplyr::mutate(trends_data_stn, Year = as.numeric(Year))
      
      trends_data_stn <- dplyr::mutate(trends_data_stn,
                                       Units= "Discharge (cms)",
                                       Units = replace(Units, grepl("Yield_mm", Statistic), "Yield (mm)"),
                                       Units = replace(Units, grepl("Volume_m3", Statistic), "Volume (cubic metres)"),
                                       Units = replace(Units, grepl("DoY", Statistic), ifelse(water_year_start == 1, 
                                                                                              "Day of Year", 
                                                                                              "Day of Water Year")),
                                       Units = replace(Units, grepl("Days", Statistic), "Number of Days"))
      
      
      # Loop through each statistic and plot the annual data, add trendline if < zyp_alpha
      for (stat in unique(trends_results_stn$Statistic)){
        # Filter for metric
        trends_data_stat <- dplyr::filter(trends_data_stn, Statistic == stat)
        trends_results_stat <- dplyr::filter(trends_results_stn, Statistic == stat)
        int <- trends_results_stat$intercept - trends_results_stat$trend * trends_results_stat$min_year
        # Plot each metric
        trends_plot <- ggplot2::ggplot(trends_data_stat, ggplot2::aes(x = Year, y = Value)) +
          ggplot2::geom_point(shape = 1, size = 2, colour = "darkblue", stroke = 2, na.rm = TRUE) +
          # ggplot2::geom_line(alpha = 0.3, na.rm = TRUE) +
          ggplot2::ggtitle(paste0(stat," (sig. = ", round(trends_results_stat$sig, 3), ")")) +
          #{if(length(unique(trends_results$STATION_NUMBER)) > 1) ggplot2::ggtitle(paste0(stn, ": ", stat,"   (Sig. = ", round(trends_results_stat$sig, 3), ")"))} +
          ggplot2::xlab(ifelse(water_year_start ==1, "Year", "Water Year"))+
          ggplot2::ylab(trends_data_stat$Units) +
          ggplot2::theme_bw() +
          ggplot2::scale_x_continuous(breaks = scales::pretty_breaks(n = 12)) +
          ggplot2::scale_y_continuous(breaks = scales::pretty_breaks(n = 6),
                                      labels = scales::label_number(scale_cut = append(scales::cut_short_scale(),1,1))) +
          ggplot2::theme(panel.border = ggplot2::element_rect(colour = "black", fill = NA, size = 1),
                         panel.grid = ggplot2::element_line(size = .2),
                         axis.title = ggplot2::element_text(size = 12),
                         axis.text = ggplot2::element_text(size = 10))
        # If sig. trend, plot trend
        if(!is.na(zyp_alpha) & trends_results_stat$sig < zyp_alpha & !is.na(trends_results_stat$sig)) {
          trends_plot <- trends_plot +
            ggplot2::geom_abline(slope = trends_results_stat$trend, intercept = int, colour = "red", linetype = "longdash")
        }
        
        if (length(unique(trends_results$STATION_NUMBER)) == 1) {
          plots_list[[ stat ]] <- trends_plot
        } else {
          plots_list[[ paste0(stn, "_", stat) ]] <- trends_plot
        }
      }
    }
  }
  
  
  # Recheck if station_number/grouping was in original flow_data and rename or remove as necessary
  if("STATION_NUMBER" %in% orig_cols) {
    names(trends_results)[names(trends_results) == "STATION_NUMBER"] <- as.character(substitute(groups))
    names(trends_data)[names(trends_data) == "STATION_NUMBER"] <- as.character(substitute(groups))
  } else {
    trends_results <- dplyr::select(trends_results, -STATION_NUMBER)
    trends_data <- dplyr::select(trends_data, -STATION_NUMBER)
  }
  
  # Create list of objects
  trends_list <- list("Annual_Trends_Data" = dplyr::as_tibble(trends_data),
                      "Annual_Trends_Results" = dplyr::as_tibble(trends_results))
  
  # Add plots to list
  if (include_plots) {
    trends_list <- append(trends_list, plots_list)
  }
  
  return(trends_list)
  
}