R/frequency_analysis.R

Defines functions mid_month last_day first_day daily_gwl_summary daily_frequency_table daily_gwl_plot weekly_frequency_plot weekly_frequency_table monthly_frequency_plot stats_by_interval monthly_frequency_table

Documented in daily_frequency_table daily_gwl_plot daily_gwl_summary first_day last_day mid_month monthly_frequency_plot monthly_frequency_table weekly_frequency_plot weekly_frequency_table

#' Create a table of monthly frequency analysis
#'
#' The table will accept daily, discrete, or a both types of data. The median of each
#' year/month is calculated. Then using that median, monthly stats are calculated.
#' Percentiles are calculated using the \code{quantile} function with "type=6".
#'
#'
#'
#' @param gw_level_dv data frame, daily groundwater level data. Often obtained
#' from \code{\link[dataRetrieval]{readNWISdv}}. Use \code{NULL} for no daily data.
#' @param gwl_data data frame returned from  \code{\link[dataRetrieval]{readNWISgwl}}, or
#' data frame with a date, value, and approval columns. Using the convention:
#' lev_dt (representing date), lev_age_cd (representing approval code), and lev_va
#' or sl_lev_va (representing value) will allow defaults to work.
#' Use \code{NULL} for no discrete data.
#' @param parameter_cd If data in gw_level_dv comes from NWIS, the parameter_cd
#' can be used to define the value_col. If the data doesn't come directly from
#' NWIS services, this can be set to \code{NA},and this argument will be ignored.
#' @param date_col the name of the date column. The default is \code{NA},
#' in which case, the code will try to get the column name automatically based on NWIS
#' naming conventions. If both gw_level_dv and gwl_data data frames
#' require custom column names, the first value of this input defines the date
#' column for gw_level_dv, and the second defines gwl_data.
#' @param value_col the name of the value column. The default is \code{NA},
#' in which case, the code will try to get the column name automatically based on NWIS
#' naming conventions. If both gw_level_dv and gwl_data data frames
#' require custom column names, the first value of this input defines the value
#' column for gw_level_dv, and the second defines gwl_data.
#' @param approved_col the name of the column to get provisional/approved status.
#' The default is \code{NA}, in which case, the code will try to get the column name
#' automatically based on NWIS naming conventions. If both gw_level_dv and
#' gwl_data data frames require custom column names, the first value of this
#' input defines the approval column for gw_level_dv, and the second defines gwl_data.
#' @param stat_cd If data in gw_level_dv comes from NWIS, the stat_cd
#' can be used to help define the value_col.
#' @param flip logical. If \code{TRUE}, flips labels so that the lower numbers
#' are in the higher percentages. Default is \code{TRUE}.
#'
#' @return a data frame of monthly groundwater level statistics including the
#' 5th, 10th, 25th, 75th, 90th, and 95th percentiles; the number of
#' years of data; and the lowest monthly median and the highest monthly
#' median.
#'
#' @export
#'
#' @examples
#'
#' # site <- "263819081585801"
#' p_code_dv <- "62610"
#' statCd <- "00001"
#' # gw_level_dv <- dataRetrieval::readNWISdv(site, p_code_dv, statCd = statCd)
#' gw_level_dv <- L2701_example_data$Daily
#' monthly_frequency <- monthly_frequency_table(gw_level_dv,
#'                                              NULL,
#'                                              parameter_cd = "62610")
#' head(monthly_frequency)
#' gwl_data <- L2701_example_data$Discrete
#'
#' monthly_frequency_combo <- monthly_frequency_table(gw_level_dv,
#'                                              gwl_data,
#'                                              parameter_cd = "62610")
#' head(monthly_frequency_combo)
#' monthly_flip <- monthly_frequency_table(gw_level_dv,
#'                                         gwl_data,
#'                                         parameter_cd = "62610",
#'                                         flip = TRUE)
#' head(monthly_flip)
monthly_frequency_table <- function(gw_level_dv,
                                    gwl_data,
                                    parameter_cd = NA,
                                    date_col = NA,
                                    value_col = NA,
                                    approved_col = NA,
                                    stat_cd = NA,
                                    flip = FALSE) {

  monthly_stats <- stats_by_interval(interval = "month",
                                     gw_level_dv = gw_level_dv,
                                     gwl_data = gwl_data,
                                     parameter_cd = parameter_cd,
                                     date_col = date_col,
                                     value_col = value_col,
                                     approved_col = approved_col,
                                     stat_cd = stat_cd,
                                     flip = flip)

  return(monthly_stats)

}


stats_by_interval <- function(interval,
                              gw_level_dv,
                              gwl_data,
                              parameter_cd = NA,
                              date_col = NA,
                              value_col = NA,
                              approved_col = NA,
                              stat_cd = NA,
                              flip = FALSE) {

  interval <- match.arg(interval,
                        choices = c("week", "month", "year"),
                        several.ok = FALSE)

  data_list <- set_up_data(gw_level_dv = gw_level_dv,
                           gwl_data = gwl_data,
                           parameter_cd = parameter_cd,
                           date_col = date_col,
                           value_col = value_col,
                           approved_col = approved_col,
                           stat_cd = stat_cd)

  gw_level_dv <- data_list$gw_level_dv
  gwl_data <- data_list$gwl_data

  gw_level_dv <- dplyr::bind_rows(gw_level_dv,
                                  gwl_data)

  gw_level_dv <- gw_level_dv[grepl("A", gw_level_dv$Approve), ]

  gw_level_dv$year <- as.POSIXlt(gw_level_dv$Date)$year + 1900
  gw_level_dv$month <- as.POSIXlt(gw_level_dv$Date)$mon + 1
  gw_level_dv$week <- as.POSIXlt(gw_level_dv$Date)$yday %/% 7 + 1

  gw_level_dv$interval <- gw_level_dv[[interval]]

  annual_stats <- gw_level_dv %>%
    dplyr::group_by(year, interval) %>%
    dplyr::summarize(median = stats::median(Value, na.rm = TRUE)) %>%
    dplyr::group_by(interval) %>%
    dplyr::summarize(minMed = min(median, na.rm = TRUE),
                     maxMed = max(median, na.rm = TRUE))

  stats <- gw_level_dv %>%
    dplyr::group_by(year, interval) %>%
    dplyr::summarize(median = stats::median(Value, na.rm = TRUE)) %>%
    dplyr::group_by(interval) %>%
    dplyr::summarize(p5_1 = quantile(median, probs = 0.05, type = 6, na.rm = TRUE),
                     p10_1 = quantile(median, probs = 0.1, type = 6, na.rm = TRUE),
                     p25_1 = quantile(median, probs = 0.25, type = 6, na.rm = TRUE),
                     p50 = quantile(median, probs = 0.5, type = 6, na.rm = TRUE),
                     p75_1 = quantile(median, probs = 0.75, type = 6, na.rm = TRUE),
                     p90_1 = quantile(median, probs = 0.9, type = 6, na.rm = TRUE),
                     p95_1 = quantile(median, probs = 0.95, type = 6, na.rm = TRUE),
                     nYears = length(unique(year))) %>%
    dplyr::left_join(annual_stats, by = "interval")

  if (flip) {
    names(stats)[names(stats) == "p5_1"] <- "p95"
    names(stats)[names(stats) == "p10_1"] <- "p90"
    names(stats)[names(stats) == "p25_1"] <- "p75"
    names(stats)[names(stats) == "p75_1"] <- "p25"
    names(stats)[names(stats) == "p90_1"] <- "p10"
    names(stats)[names(stats) == "p95_1"] <- "p05"
    names(stats)[names(stats) == "minMed"] <- "maxMedium"
    names(stats)[names(stats) == "maxMed"] <- "minMedium"
    names(stats)[names(stats) == "maxMedium"] <- "maxMed"
    names(stats)[names(stats) == "minMedium"] <- "minMed"
  } else {
    names(stats)[names(stats) == "p5_1"] <- "p05"
    names(stats)[names(stats) == "p10_1"] <- "p10"
    names(stats)[names(stats) == "p25_1"] <- "p25"
    names(stats)[names(stats) == "p75_1"] <- "p75"
    names(stats)[names(stats) == "p90_1"] <- "p90"
    names(stats)[names(stats) == "p95_1"] <- "p95"
  }

  stats <- stats[, c("interval",
                     "p05", "p10", "p25", "p50",
                     "p75", "p90", "p95",
                     "nYears", "minMed", "maxMed")]

  names(stats)[names(stats) == "interval"] <- interval

  return(stats)
}

#' Plot monthly frequency analysis
#'
#' This plot uses calculations from \code{monthly_frequency_table}. Daily, discrete,
#' or both types of data can be included.
#'
#' @param gw_level_dv data frame, daily groundwater level data. Often obtained
#' from \code{\link[dataRetrieval]{readNWISdv}}. Use \code{NULL} for no daily data.
#' @param gwl_data data frame returned from  \code{\link[dataRetrieval]{readNWISgwl}}, or
#' data frame with a date, value, and approval columns. Using the convention:
#' lev_dt (representing date), lev_age_cd (representing approval code), and lev_va
#' or sl_lev_va (representing value) will allow defaults to work.
#' Use \code{NULL} for no discrete data.
#' @param parameter_cd If data in gw_level_dv comes from NWIS, the parameter_cd
#' can be used to define the value_col. If the data doesn't come directly from
#' NWIS services, this can be set to \code{NA},and this argument will be ignored.
#' @param date_col the name of the date column. The default is \code{NA},
#' in which case, the code will try to get the column name automatically based on NWIS
#' naming conventions. If both gw_level_dv and gwl_data data frames
#' require custom column names, the first value of this input defines the date
#' column for gw_level_dv, and the second defines gwl_data.
#' @param value_col the name of the value column. The default is \code{NA},
#' in which case, the code will try to get the column name automatically based on NWIS
#' naming conventions. If both gw_level_dv and gwl_data data frames
#' require custom column names, the first value of this input defines the value
#' column for gw_level_dv, and the second defines gwl_data.
#' @param approved_col the name of the column to get provisional/approved status.
#' The default is \code{NA}, in which case, the code will try to get the column name
#' automatically based on NWIS naming conventions. If both gw_level_dv and
#' gwl_data data frames require custom column names, the first value of this
#' input defines the approval column for gw_level_dv, and the second defines gwl_data.
#' @param plot_range the time frame to use for the plot. Either "Past year" to use the
#' last year of data, or "Calendar year" to use the current calendar year, beginning
#' in January.
#' @param stat_cd If data in gw_level_dv comes from NWIS, the stat_cd
#' can be used to help define the value_col.
#' @param plot_title the title to use on the plot.
#' @param subtitle character. Sub-title for plot, default is "U.S. Geological Survey".
#' @param y_axis_label the label used for the y-axis of the plot.
#' @param flip logical. If \code{TRUE}, flips the y axis so that the smallest number is on top.
#' Default is \code{TRUE}.
#' @param percentile_colors Optional argument to provide a vector of 5 or 7 colors
#' used to fill the percentile bars in order from lowest percentile bin to the
#' highest percentile bin. Default behavior (\code{NA}) is to use legacy plot colors. If 
#' include_edges parameter is set to TRUE, then this vector must be 7 colors long.
#' @param include_edges Optional argument to toggle on the "edge bins" min-5 and 95-max on the plot.
#' Default is FALSE which does not plot those bins.
#' @param median_point_size Optional argument to specify the size of the median point markers
#' which are shown as black triangles on the plot. The default size is 2.5.
#' @param data_point_size Optional argument to specify the size of the data point markers 
#' which are shown as red diamonds on the plot. The default size is 2.5.
#' @return a ggplot with rectangles representing the historical monthly percentile,
#' black triangles representing the historical monthly median, and red diamonds
#' showing the last year of groundwater level measurements.
#'
#' @import ggplot2
#'
#' @export
#'
#' @examples
#'
#' # site <- "263819081585801"
#' p_code_dv <- "62610"
#' statCd <- "00001"
#' # gw_level_dv <- dataRetrieval::readNWISdv(site, p_code_dv, statCd = statCd)
#' gw_level_dv <- L2701_example_data$Daily
#' label <- dataRetrieval::readNWISpCode(p_code_dv)[["parameter_nm"]]
#' monthly_frequency <- monthly_frequency_plot(gw_level_dv,
#'                                             gwl_data = NULL,
#'                                             parameter_cd = "62610",
#'                                             plot_title = "L2701 Groundwater Level",
#'                                             y_axis_label = label,
#'                                             flip = FALSE)
#' monthly_frequency
#'
#' gwl_data <- L2701_example_data$Discrete
#'
#' monthly_frequency_plot(gw_level_dv,
#'                        gwl_data = gwl_data,
#'                        parameter_cd = "62610",
#'                        plot_title = "L2701 Groundwater Level",
#'                        y_axis_label = label,
#'                        flip = FALSE)
#'
#' monthly_frequency_flip <- monthly_frequency_plot(gw_level_dv,
#'                                                  gwl_data,
#'                                                  parameter_cd = "62610",
#'                                                  y_axis_label = label,
#'                                                  plot_title = "L2701 Groundwater Level",
#'                                                  flip = TRUE)
#' monthly_frequency_flip
#'
#' monthly_frequency_custom_colors <- monthly_frequency_plot(gw_level_dv,
#'                                                  gwl_data,
#'                                                  parameter_cd = "62610",
#'                                                  y_axis_label = label,
#'                                                  plot_title = "L2701 Groundwater Level",
#'                                                  flip = TRUE,
#'                                                  percentile_colors = c(
#'                                                      "red",
#'                                                      "yellow",
#'                                                      "green",
#'                                                      "blue",
#'                                                      "orange"
#'                                                  ))
#'                                                  
#' monthly_frequency_custom_colors
#' 
#' monthly_frequency_edge_bins <- monthly_frequency_plot(gw_level_dv,
#'                                                       gwl_data,
#'                                                       parameter_cd = "62610",
#'                                                       y_axis_label = label,
#'                                                       plot_title = "L2701 Groundwater Level",
#'                                                       flip = FALSE,
#'                                                       include_edges = TRUE)
#' monthly_frequency_edge_bins
#' 
#' monthly_frequency_custom_point_sizes <- monthly_frequency_plot(gw_level_dv,
#'                                                                gwl_data = gwl_data,
#'                                                                parameter_cd = "62610",
#'                                                                plot_title = "L2701 Groundwater Level",
#'                                                                y_axis_label = label,
#'                                                                median_point_size = 0.5,
#'                                                                data_point_size = 3)
#' monthly_frequency_custom_point_sizes 
#' 
monthly_frequency_plot <- function(gw_level_dv,
                                   gwl_data,
                                   parameter_cd = NA,
                                   date_col = NA,
                                   value_col = NA,
                                   approved_col = NA,
                                   stat_cd = NA,
                                   plot_title = "",
                                   subtitle = "U.S. Geological Survey",
                                   plot_range = c("Past year"),
                                   y_axis_label = "",
                                   flip = FALSE,
                                   percentile_colors = NA,
                                   include_edges = FALSE,
                                   median_point_size = 2.5,
                                   data_point_size = 2.5) {

  data_list <- set_up_data(gw_level_dv = gw_level_dv,
                           gwl_data = gwl_data,
                           parameter_cd = parameter_cd,
                           date_col = date_col,
                           value_col = value_col,
                           approved_col = approved_col,
                           stat_cd = stat_cd)

  gw_level_dv <- data_list$gw_level_dv
  gwl_data <- data_list$gwl_data

  plot_range <- match.arg(plot_range)

  date <- max(c(gw_level_dv$Date,
                gwl_data$Date),
              na.rm = TRUE)


  # Calculate the percentiles
  site_statistics <- monthly_frequency_table(gw_level_dv,
                                             gwl_data,
                                             parameter_cd = NA,
                                             date_col = c("Date", "Date"),
                                             value_col = c("Value", "Value"),
                                             approved_col = c("Approve", "Approve"),
                                             flip = flip)

  # Find the bounds of the plot.
  if (plot_range == "Past year") {
    plot_end <- last_day(date) + 1
    plot_start <- first_day(plot_end - 363)
  } else if (plot_range == "Calendar year") {
    calendar_year <- as.POSIXlt(date)$year + 1900
    plot_end <- as.Date(paste0(calendar_year, "-12-31"))
    plot_start <- as.Date(paste0(calendar_year, "-01-01"))
  }

  # The last year of groundwater level measurements will plot
  gw_level_dv <- dplyr::bind_rows(gw_level_dv,
                                  gwl_data)

  gw_level_dv <- gw_level_dv[gw_level_dv$Date >= plot_start &
                               gw_level_dv$Date <= plot_end, ]

  # Add the first day of the month to the site_statistics table for plotting
  plot_month <- seq(as.Date(plot_start), length = 12, by = "1 month")
  plot_month_lookup <- data.frame(plot_month = plot_month,
                                  month = as.POSIXlt(plot_month)$mon + 1)
  site_statistics <- dplyr::left_join(site_statistics,
                                      plot_month_lookup, by = "month")

  # set up default configurations for plot elements
  # Set up the plot data for the percentile ranges (rectangle geometry)
  site_statistics_pivot <- site_statistics %>%
    dplyr::select(-month, -nYears, -minMed, -maxMed) %>%
    tidyr::pivot_longer(cols = -plot_month,
                        names_to = "name",
                        values_to = "value")
  # set colors and groups
  cols <- list(c("p05", "p10"), c("p10", "p25"), c("p25", "p75"),
               c("p75", "p90"), c("p90", "p95"))
  groups <- c("5 - 10", "10 - 25", "25 - 75", "75 - 90", "90 - 95")
  # define default colors

  color_list <- c("darkred", "orange2", "green2", "steelblue1", "blue")
  
  if(include_edges){
    color_list <- c("firebrick3", color_list, "darkblue")
  } 
  
  # custom colors and shapes
  if (length(percentile_colors) >= 5) {
    color_list <- percentile_colors
  } else if (is.na(percentile_colors) == FALSE) {
    warning(
      paste0(
        "percentile_colors argument was provided but was invalid,",
        " should be a vector of length 5 or 7 in which each item in",
        " the vector represents a color."
      )
    )
  }
  
  # set scale breaks
  scale_breaks <- c("90 - 95",
                    "75 - 90",
                    "25 - 75",
                    "10 - 25",
                    "5 - 10")
  
  # updates if the edges are to be included on the plot
  if (include_edges) {
    # alternative pivot to include min/max values
    site_statistics_pivot <- site_statistics %>%
      dplyr::select(-month, -nYears) %>%
      tidyr::pivot_longer(cols = -plot_month,
                          names_to = "name",
                          values_to = "value")
    # colors and groups
    cols <- append(list(c("minMed", "p05")), cols)
    cols <- append(cols, list(c("p95", "maxMed")))
    groups <- c("0 - 5", groups, "95 - 100")
    # expand color list
    
    # expand rectangle colors
    rectangle_colors <- c("0 - 5" = color_list[1],
                          "5 - 10" = color_list[2],
                          "10 - 25" = color_list[3],
                          "25 - 75" = color_list[4],
                          "75 - 90" = color_list[5],
                          "90 - 95" = color_list[6],
                          "95 - 100" = color_list[7])
    # expand the scale breaks
    scale_breaks <- c("95 - 100",
                      scale_breaks,
                      "0 - 5")
  } else {
    # set plot colors and markers
    rectangle_colors <- c("5 - 10" = color_list[1],
                          "10 - 25" = color_list[2],
                          "25 - 75" = color_list[3],
                          "75 - 90" = color_list[4],
                          "90 - 95" = color_list[5])
  }
  
  plot_list <- data.frame()

  for (i in seq_along(cols)) {
    plot_data <- site_statistics_pivot %>%
      dplyr::filter(name %in% cols[[i]]) %>%
      tidyr::pivot_wider(id_cols = plot_month,
                         names_from = name,
                         values_from = value) %>%
      dplyr::rename(ymin = cols[[i]][1], ymax = cols[[i]][2]) %>%
      dplyr::mutate(group = groups[i])

    plot_list <- dplyr::bind_rows(plot_list, plot_data)
  }

  # Make the group an ordered factor so the legend has the correct order
  # and add the last day of the month to draw the rectangles
  site_statistics_plot <- plot_list
  site_statistics_plot$group <- factor(site_statistics_plot$group,
                                       levels = groups,
                                       ordered = TRUE)

  site_statistics_plot$plot_month_last <- last_day(site_statistics_plot$plot_month) + 1


  # The median value will plot in the middle of the month
  site_statistics_med <- site_statistics
  site_statistics_med$group <- "Monthly median"
  site_statistics_med$plot_month_med <- mid_month(site_statistics_med$plot_month)

  site_statistics_med <- site_statistics_med[, c("plot_month_med", "p50", "group")]
  names(site_statistics_med) <- c("month", "value", "group")
  points_plot <- gw_level_dv

  if (nrow(gw_level_dv) > 0) {
    points_plot$group <- "Data point"
    points_plot <- points_plot[, c("Date", "Value", "group")]
    names(points_plot) <- c("month", "value", "group")
    points_plot <- dplyr::bind_rows(site_statistics_med,
                                    points_plot)
  }

  point_shapes <- c("Monthly median" = 17,
                    "Data point" = 18)
  point_colors <- c("Monthly median" = "black",
                    "Data point" = "red")
  point_sizes <- c("Monthly median" = median_point_size,
                   "Data point" = data_point_size)

  # Create the plot labels
  start_year <- as.POSIXlt(plot_start)$year + 1900
  end_year <- as.POSIXlt(plot_end)$year + 1900
  if (start_year == end_year) {
    x_label <- as.character(start_year)
  } else {
    x_label <- paste(start_year, end_year, sep = " - ")
  }
  y_label <- y_axis_label

  # Plot
  plot_out <- ggplot() +
    geom_rect(data = site_statistics_plot,
              aes(xmin = plot_month,
                  xmax = plot_month_last,
                  ymin = ymin,
                  ymax = ymax,
                  fill = group),
              color = "black") +
    geom_vline(xintercept = plot_month)

  if (nrow(points_plot) > 0) {
    plot_out <- plot_out +
      geom_point(data = points_plot,
                 aes(
                   x = month,
                   y = value,
                   shape = group,
                   color = group,
                   size = group
                 ))
  }
  
  # make plot
  plot_out <- plot_out +
    scale_fill_manual(values = rectangle_colors,
                      name = "Percentile",
                      breaks = scale_breaks) +
    scale_shape_manual(name = "EXPLANATION",
                       values = point_shapes, 
                       breaks = names(point_shapes)) +
    scale_color_manual(name = "EXPLANATION",
                       values = point_colors, 
                       breaks = names(point_colors)) +
    scale_size_manual(values = point_sizes,
                      guide = "none") + 
    scale_x_date(limits = c(plot_start, plot_end + 1), expand = c(0, 0),
                 breaks = mid_month(plot_month),
                 labels = month.abb[as.POSIXlt(plot_month)$mon + 1]) +
    hasp_framework(x_label = x_label,
                   y_label = y_label,
                   plot_title = plot_title,
                   subtitle = subtitle) +
    theme(axis.ticks.x = element_blank()) +
    guides(shape = guide_legend(order = 1),
           color = guide_legend(order = 1),
           fill = guide_legend(order = 2))

  if (flip) {
    plot_out <- plot_out +
      scale_y_continuous(trans = "reverse")
  }

  return(plot_out)
}

#' Create a table of weekly frequency analysis
#'
#' The weekly frequency analysis is based on daily, discrete, or both types of
#' data. The median of each year/week combo is calculated, then overall weekly
#' statistics are calculated off of that median.
#'
#' @param gw_level_dv data frame, daily groundwater level data. Often obtained
#' from \code{\link[dataRetrieval]{readNWISdv}}. Use \code{NULL} for no daily data.
#' @param gwl_data data frame returned from  \code{\link[dataRetrieval]{readNWISgwl}}, or
#' data frame with a date, value, and approval columns. Using the convention:
#' lev_dt (representing date), lev_age_cd (representing approval code), and lev_va
#' or sl_lev_va (representing value) will allow defaults to work.
#' Use \code{NULL} for no discrete data.
#' @param parameter_cd If data in gw_level_dv comes from NWIS, the parameter_cd
#' can be used to define the value_col. If the data doesn't come directly from
#' NWIS services, this can be set to \code{NA},and this argument will be ignored.
#' @param date_col the name of the date column. The default is \code{NA},
#' in which case, the code will try to get the column name automatically based on NWIS
#' naming conventions. If both gw_level_dv and gwl_data data frames
#' require custom column names, the first value of this input defines the date
#' column for gw_level_dv, and the second defines gwl_data.
#' @param value_col the name of the value column. The default is \code{NA},
#' in which case, the code will try to get the column name automatically based on NWIS
#' naming conventions. If both gw_level_dv and gwl_data data frames
#' require custom column names, the first value of this input defines the value
#' column for gw_level_dv, and the second defines gwl_data.
#' @param approved_col the name of the column to get provisional/approved status.
#' The default is \code{NA}, in which case, the code will try to get the column name
#' automatically based on NWIS naming conventions. If both gw_level_dv and
#' gwl_data data frames require custom column names, the first value of this
#' input defines the approval column for gw_level_dv, and the second defines gwl_data.
#' @param flip logical. If \code{TRUE}, flips labels so that the lower numbers
#' are in the higher percentages. Default is \code{TRUE}.
#' @param stat_cd If data in gw_level_dv comes from NWIS, the stat_cd
#' can be used to help define the value_col.
#'
#' @return a data frame of weekly frequency analysis
#'
#' @export
#'
#' @examples
#'
#' # site <- "263819081585801"
#' p_code_dv <- "62610"
#' statCd <- "00001"
#' # gw_level_dv <- dataRetrieval::readNWISdv(site, p_code_dv, statCd = statCd)
#' gw_level_dv <- L2701_example_data$Daily
#' weekly_frequency <- weekly_frequency_table(gw_level_dv,
#'                                            NULL,
#'                                            parameter_cd = "62610")
#' head(weekly_frequency)
#'
#' gwl_data <- L2701_example_data$Discrete
#'
#' weekly_frequency <- weekly_frequency_table(gw_level_dv,
#'                                            gwl_data,
#'                                            parameter_cd = "62610")
#' weekly_frequency
#' weekly_flip <- weekly_frequency_table(gw_level_dv,
#'                                       gwl_data,
#'                                       parameter_cd = "62610",
#'                                       flip = TRUE)
#' weekly_flip
weekly_frequency_table <- function(gw_level_dv,
                                   gwl_data,
                                   parameter_cd = NA,
                                   date_col = NA,
                                   value_col = NA,
                                   approved_col = NA,
                                   stat_cd = NA,
                                   flip = FALSE) {

  weekly_stats <- stats_by_interval(interval = "week",
                                     gw_level_dv = gw_level_dv,
                                     gwl_data = gwl_data,
                                     parameter_cd = parameter_cd,
                                     date_col = date_col,
                                     value_col = value_col,
                                     approved_col = approved_col,
                                     stat_cd = stat_cd,
                                     flip = flip)

  weekly_stats$week_start <- format(as.Date(paste(weekly_stats$week - 1, 1, sep = "-"),
                                    "%U-%u"), "%m-%d")

  return(weekly_stats)

}

#' Plot weekly frequency analysis
#'
#' Weekly statistics are calculated using the \code{weekly_frequency_table} function.
#' Daily, discrete, or both types of data can be used.
#'
#' @param gw_level_dv data frame, daily groundwater level data. Often obtained
#' from \code{\link[dataRetrieval]{readNWISdv}}. Use \code{NULL} for no daily data.
#' @param gwl_data data frame returned from  \code{\link[dataRetrieval]{readNWISgwl}}, or
#' data frame with a date, value, and approval columns. Using the convention:
#' lev_dt (representing date), lev_age_cd (representing approval code), and lev_va
#' or sl_lev_va (representing value) will allow defaults to work.
#' Use \code{NULL} for no discrete data.
#' @param parameter_cd If data in gw_level_dv comes from NWIS, the parameter_cd
#' can be used to define the value_col. If the data doesn't come directly from
#' NWIS services, this can be set to \code{NA},and this argument will be ignored.
#' @param date_col the name of the date column. The default is \code{NA},
#' in which case, the code will try to get the column name automatically based on NWIS
#' naming conventions. If both gw_level_dv and gwl_data data frames
#' require custom column names, the first value of this input defines the date
#' column for gw_level_dv, and the second defines gwl_data.
#' @param value_col the name of the value column. The default is \code{NA},
#' in which case, the code will try to get the column name automatically based on NWIS
#' naming conventions. If both gw_level_dv and gwl_data data frames
#' require custom column names, the first value of this input defines the value
#' column for gw_level_dv, and the second defines gwl_data.
#' @param approved_col the name of the column to get provisional/approved status.
#' The default is \code{NA}, in which case, the code will try to get the column name
#' automatically based on NWIS naming conventions. If both gw_level_dv and
#' gwl_data data frames require custom column names, the first value of this
#' input defines the approval column for gw_level_dv, and the second defines gwl_data.
#' @param plot_range the time frame to use for the plot. Either "Past year" to use the
#' last year of data, or "Calendar year" to use the current calendar year, beginning
#' in January.
#' @param stat_cd If data in gw_level_dv comes from NWIS, the stat_cd
#' can be used to help define the value_col.
#' @param plot_title the title to use on the plot
#' @param subtitle character. Sub-title for plot, default is "U.S. Geological Survey".
#' @param y_axis_label the label used for the y-axis of the plot.
#' @param flip logical. If \code{TRUE}, flips the y axis so that the smallest number is on top.
#' Default is \code{FALSE}.
#' @param percentile_colors Optional argument to provide a vector of 5 colors
#' used to fill the percentile bars in order from lowest percentile bin to the
#' highest percentile bin. Default behavior (NA) is to use legacy plot colors.
#' @return a ggplot object with rectangles representing the historical weekly percentiles,
#' and points representing the historical median and daily values
#'
#' @import ggplot2
#'
#' @export
#'
#' @examples
#'
#' # site <- "263819081585801"
#' p_code_dv <- "62610"
#' statCd <- "00001"
#' # gw_level_dv <- dataRetrieval::readNWISdv(site, p_code_dv, statCd = statCd)
#' gw_level_dv <- L2701_example_data$Daily
#' weekly_frequency_plot(gw_level_dv,
#'                       gwl_data = NULL,
#'                       date_col = "Date",
#'                       value_col = "X_62610_00001",
#'                       approved_col = "X_62610_00001_cd")
#'
#' gwl_data <- L2701_example_data$Discrete
#'
#' weekly_frequency_plot(gw_level_dv,
#'                       gwl_data = gwl_data,
#'                       parameter_cd = "62610")
#'
#' weekly_frequency_plot(gw_level_dv,
#'                       gwl_data = gwl_data,
#'                       parameter_cd = "62610",
#'                       flip = TRUE)
#'
weekly_frequency_plot <- function(gw_level_dv,
                                  gwl_data,
                                  parameter_cd = NA,
                                  date_col = NA,
                                  value_col = NA,
                                  approved_col = NA,
                                  stat_cd = NA,
                                  plot_range = "Past year",
                                  plot_title = "",
                                  subtitle = "U.S. Geological Survey",
                                  y_axis_label = "",
                                  flip = FALSE,
                                  percentile_colors = NA) {

  data_list <- set_up_data(gw_level_dv = gw_level_dv,
                           gwl_data = gwl_data,
                           parameter_cd = parameter_cd,
                           date_col = date_col,
                           value_col = value_col,
                           approved_col = approved_col,
                           stat_cd = stat_cd)

  gw_level_dv <- data_list$gw_level_dv
  gwl_data <- data_list$gwl_data

  plot_range <- match.arg(plot_range, choices = c("Past year",
                                                  "Calendar year"),
                          several.ok = FALSE)

  date <- max(c(gw_level_dv$Date,
                gwl_data$Date),
              na.rm = TRUE)

  # Calculate the percentiles
  site_statistics <- weekly_frequency_table(gw_level_dv,
                                            gwl_data,
                                            date_col = c("Date", "Date"),
                                            value_col = c("Value", "Value"),
                                            approved_col = c("Approve", "Approve"),
                                            flip = flip)

  # Find the bounds of the plot
  if (plot_range == "Past year") {
    plot_end <- last_day(date) + 1
    plot_start <- first_day(plot_end - 363)
  } else if (plot_range == "Calendar year") {
    calendar_year <- format(date, format = "%Y")
    plot_end <- as.Date(paste0(calendar_year, "-12-31"))
    plot_start <- as.Date(paste0(calendar_year, "-01-01"))
  }

  # The last year of groundwater level measurements will plot
  gw_level_plot <- gw_level_dv %>%
    dplyr::filter(Date >= plot_start)

  # Add the first day of the week to the site_statistics table for plotting
  plot_week <- seq(as.Date(plot_start), length = 52, by = "1 week")
  plot_week_lookup <- data.frame(plot_week = plot_week,
                                 week = as.POSIXlt(plot_week)$yday %/% 7 + 1)

  site_statistics <- dplyr::left_join(site_statistics,
                                      plot_week_lookup,
                                      by = "week")

  # Set up the plot data for the percentile ranges (rectangle geometry)
  site_statistics_pivot <- site_statistics %>%
    dplyr::select(-week, -nYears, -minMed, -maxMed, -week_start) %>%
    tidyr::pivot_longer(cols = -plot_week,
                        names_to = "name",
                        values_to = "value")

  cols <- list(c("p05", "p10"), c("p10", "p25"), c("p25", "p75"),
               c("p75", "p90"), c("p90", "p95"))
  groups <- c("5 - 10", "10 - 25", "25 - 75", "75 - 90", "90 - 95")
  plot_list <- data.frame()
  for (i in seq_along(cols)) {
    plot_data <- site_statistics_pivot %>%
      dplyr::filter(name %in% cols[[i]]) %>%
      tidyr::pivot_wider(id_cols = plot_week, names_from = name, values_from = value) %>%
      dplyr::rename(ymin = cols[[i]][1], ymax = cols[[i]][2]) %>%
      dplyr::mutate(group = groups[i])
    plot_list <- dplyr::bind_rows(plot_list, plot_data)
  }

  # Make the group an ordered factor so the legend has the correct order
  # and add the last day of the month to draw the rectangles
  site_statistics_plot <- plot_list %>%
    dplyr::mutate(group = factor(group,
                          levels = groups,
                          ordered = TRUE),
           plot_week_last = plot_week + 7)

  # The median value will plot in the middle of the month
  site_statistics_med <- site_statistics %>%
    dplyr::mutate(plot_week_med = plot_week + 3,
           group = "Historical weekly median") %>%
    dplyr::select(plot_week_med, p50, group) %>%
    dplyr::rename(x = plot_week_med, y = p50)

  data_points <- gw_level_plot %>%
    dplyr::mutate(gw_code = ifelse(grepl("A", Approve), "Approved", "Provisional"),
           group = sprintf("%s daily value", gw_code)) %>%
    dplyr::rename(x = Date,
           y = Value) %>%
    dplyr::select(x, y, group)

  point_data <- dplyr::bind_rows(site_statistics_med, data_points) %>%
    dplyr::mutate(group = factor(group,
                          levels = c("Historical weekly median",
                                     "Approved daily value",
                                     "Provisional daily value"),
                          ordered = TRUE))

  # Assign colors and shapes
  groups <- c("5 - 10", "10 - 25", "25 - 75", "75 - 90", "90 - 95")
  color_list <- c("palevioletred2", "rosybrown1", "darkolivegreen1", "lightskyblue2",
                                  "skyblue3")
                                  
  if (length(percentile_colors) >= 5) {
    color_list <- percentile_colors
  }
  else if (is.na(percentile_colors) == FALSE) {
    warning(paste0("percentile_colors argument was provided but was invalid,",
                   " should be a vector of length 5 in which each item in",
                   " the vector represents a color."))
  }
  
  rectangle_colors <- c(`5 - 10` = color_list[1], `10 - 25` = color_list[2],
                        `25 - 75` = color_list[3], `75 - 90` = color_list[4],
                        `90 - 95` = color_list[5])
  point_shapes <- c("Historical weekly median" = 17,
                    "Provisional daily value" = 16,
                    "Approved daily value" = 16)
  point_colors <- c("Approved daily value" = "black",
                    "Provisional daily value" = "red",
                    "Historical weekly median" = "springgreen4")

  point_shapes <- point_shapes[as.character(unique(point_data$group))]
  point_colors <- point_colors[as.character(unique(point_data$group))]

  # Create the plot labels
  year_start <- as.POSIXlt(plot_start)$year + 1900
  year_end <- as.POSIXlt(plot_end)$year + 1900

  if (year_start == year_end) {
    x_label <- as.character(year_start)
  } else {
    x_label <- paste(year_start, year_end, sep = " - ")
  }
  y_label <- y_axis_label

  # Create the month breaks
  month_start <- seq(as.Date(plot_start), length = 12, by = "1 month")
  month_breaks <- mid_month(month_start)
  month_labels <- month.abb[as.POSIXlt(month_breaks)$mon + 1]

  order_groups <- c("Approved daily value",
                    "Provisional daily value",
                    "Historical weekly median")

  point_data$group <- factor(point_data$group,
                             levels = order_groups)
  point_data$group <- droplevels(point_data$group)
  # Plot
  plot_out <- ggplot() +
    geom_rect(data = site_statistics_plot,
              aes(xmin = plot_week,
                  xmax = plot_week_last,
                  ymin = ymin,
                  ymax = ymax,
                  fill = group)) +
    geom_vline(xintercept = plot_week, color = "gray90") +
    geom_point(data = dplyr::filter(point_data,
                                    group == "Historical weekly median"),
               aes(x = x, y = y, color = group),
               size = 1, shape = 17) +
    geom_line(data = dplyr::filter(point_data,
                                   group != "Historical weekly median"),
               aes(x = x, y = y, color = group), linewidth = 1) +
    geom_vline(xintercept = month_start, color = "grey70") +
    scale_color_manual(values = point_colors, breaks = order_groups,
                       name = "EXPLANATION") +
    scale_shape_manual(values = c(17, NA, NA), name = "EXPLANATION") +
    scale_fill_manual(values = rectangle_colors,
                      breaks = c("90 - 95",
                                 "75 - 90",
                                 "25 - 75",
                                 "10 - 25",
                                 "5 - 10"),
                      name = "Percentile") +
    scale_x_date(limits = c(plot_start, plot_end + 1), expand = c(0, 0),
                 breaks = month_breaks, labels = month_labels) +
    hasp_framework(x_label, y_label,
                   plot_title = plot_title,
                   subtitle = subtitle) +
    theme(axis.ticks.x = element_blank(),
          aspect.ratio = NULL)

  if (length(unique(point_data$group)) == 3) {
    plot_out <- plot_out +
      guides(color = guide_legend(order = 1,
                                override.aes = list(shape = c(NA, NA, 17),
                                                    linetype = c("solid", "solid", "blank"))),
           shape = "none",
           fill = guide_legend(order = 2))
  } else if (length(unique(point_data$group)) == 2) {
    #TODO: be smarter:
    plot_out <- plot_out +
      guides(color = guide_legend(order = 1,
                                  override.aes = list(shape = c(NA, 17),
                                                      linetype = c("solid",  "blank"))),
             shape = "none",
             fill = guide_legend(order = 2))
  }

  if (flip) {
    plot_out <- plot_out +
      scale_y_continuous(trans = "reverse")
  }

  return(plot_out)

}

#' Plot recent data
#'
#' Calculates daily statistics based on all approved data.
#' Daily, discrete, or both types are included.
#' Historic median or mean are plotted based on all of the approved data.
#'
#' @param gw_level_dv data frame, daily groundwater level data. Often obtained
#' from \code{\link[dataRetrieval]{readNWISdv}}. Use \code{NULL} for no daily data.
#' @param gwl_data data frame returned from  \code{\link[dataRetrieval]{readNWISgwl}}, or
#' data frame with a date, value, and approval columns. Using the convention:
#' lev_dt (representing date), lev_age_cd (representing approval code), and lev_va
#' or sl_lev_va (representing value) will allow defaults to work.
#' Use \code{NULL} for no discrete data.
#' @param parameter_cd If data in gw_level_dv comes from NWIS, the parameter_cd
#' can be used to define the value_col. If the data doesn't come directly from
#' NWIS services, this can be set to \code{NA},and this argument will be ignored.
#' @param date_col the name of the date column. The default is \code{NA},
#' in which case, the code will try to get the column name automatically based on NWIS
#' naming conventions. If both gw_level_dv and gwl_data data frames
#' require custom column names, the first value of this input defines the date
#' column for gw_level_dv, and the second defines gwl_data.
#' @param value_col the name of the value column. The default is \code{NA},
#' in which case, the code will try to get the column name automatically based on NWIS
#' naming conventions. If both gw_level_dv and gwl_data data frames
#' require custom column names, the first value of this input defines the value
#' column for gw_level_dv, and the second defines gwl_data.
#' @param approved_col the name of the column to get provisional/approved status.
#' The default is \code{NA}, in which case, the code will try to get the column name
#' automatically based on NWIS naming conventions. If both gw_level_dv and
#' gwl_data data frames require custom column names, the first value of this
#' input defines the approval column for gw_level_dv, and the second defines gwl_data.#' @param stat_cd If data in gw_level_dv comes from NWIS, the stat_cd
#' can be used to help define the value_col.
#' @param stat_cd If data in gw_level_dv comes from NWIS, the stat_cd
#' can be used to help define the value_col.
#' @param start_date Date to start plot. If \code{NA} (which is the default),
#' the plot will start 2 years before the most recent value.
#' @param end_date Date to end plot. If \code{NA} (which is the default),
#' the plot will end with the latest measurement.
#' @param historical_stat the summary statistic to use for middle line of the plot. Either
#' "mean" or "median."
#' @param month_breaks a logical indicating whether to use monthly breaks for the plot
#' @param plot_title the title to use on the plot
#' @param subtitle character. Sub-title for plot, default is "U.S. Geological Survey".
#' @param y_axis_label the label to use for the y axis
#' @param flip logical. If \code{TRUE}, flips the y axis so that the smallest number is on top.
#' Default is \code{FALSE}.
#' @return a ggplot object with a ribbon indicating the historical daily range,
#' the historical daily mean or median, and approved and provisional
#' daily data for the last two years
#'
#' @export
#'
#' @import ggplot2
#'
#' @examples
#'
#' site <- "263819081585801"
#' p_code_dv <- "62610"
#' statCd <- "00001"
#' # gw_level_dv <- dataRetrieval::readNWISdv(site, p_code_dv, statCd = statCd)
#' gw_level_dv <- L2701_example_data$Daily
#'
#' gwl_data <- L2701_example_data$Discrete
#'
#' daily_gwl_plot(gw_level_dv,
#'                NULL,
#'                parameter_cd = "62610",
#'                plot_title = "Groundwater Level",
#'                historical_stat = "median")
#'
#' daily_gwl_plot(gw_level_dv,
#'                gwl_data,
#'                parameter_cd = "62610",
#'                plot_title = "Groundwater Level",
#'                historical_stat = "median")
#'
#' daily_gwl_plot(gw_level_dv,
#'                gwl_data,
#'                parameter_cd = "62610",
#'                plot_title = "Groundwater Level",
#'                month_breaks = TRUE,
#'                start_date = "2020-10-01",
#'                historical_stat = "median")
#'
#' daily_gwl_plot(gw_level_dv, gwl_data,
#'                parameter_cd = "62610",
#'                plot_title = "Groundwater Level",
#'                month_breaks = TRUE,
#'                start_date = "2018-10-01",
#'                end_date = "2020-10-01",
#'                historical_stat = "median")
#'
daily_gwl_plot <- function(gw_level_dv,
                           gwl_data,
                           parameter_cd = NA,
                           date_col = NA,
                           value_col = NA,
                           approved_col = NA,
                           stat_cd = NA,
                           start_date = NA,
                           end_date = NA,
                           historical_stat = "mean",
                           month_breaks = FALSE,
                           plot_title = "",
                           subtitle = "U.S. Geological Survey",
                           y_axis_label = "",
                           flip = FALSE) {

  data_list <- set_up_data(gw_level_dv = gw_level_dv,
                           gwl_data = gwl_data,
                           parameter_cd = parameter_cd,
                           date_col = date_col,
                           value_col = value_col,
                           approved_col = approved_col,
                           stat_cd = stat_cd)

  gw_level_dv <- data_list$gw_level_dv
  gwl_data <- data_list$gwl_data

  gw_level_dv <- dplyr::bind_rows(gw_level_dv,
                                  gwl_data)

  historical_stat <- match.arg(historical_stat,
                               choices = c("mean", "median"),
                               several.ok = FALSE)
  historical_function <- switch(historical_stat,
                                "median" = median,
                                "mean" = mean)
  historical_name <- paste("Historical", historical_stat)

  # Calculate the historical max/min/median for each day

  gw_level_dv$J <- as.numeric(format(gw_level_dv$Date,
                                     format = "%j"))

  historical_stats <- gw_level_dv[grepl("A", gw_level_dv$Approve), ] %>%
    dplyr::group_by(J) %>%
    dplyr::summarize(max = max(Value, na.rm = TRUE),
                     middle = historical_function(Value, na.rm = TRUE),
                     min = min(Value, na.rm = TRUE))

  # Pull the last two years of data & join with the historical data

  if (is.na(end_date)) {
    end_date <- max(c(gw_level_dv$Date), na.rm = TRUE)
  } else {
    end_date <- as.Date(end_date)
  }

  if (is.na(start_date)) {
    plot_start_year <- as.numeric(format(end_date, format = "%Y")) - 2
    plot_start <- as.Date(paste(plot_start_year,
                                format(end_date, format = "%m-%d"),
                                sep = "-"))
  } else {
    plot_start <- as.Date(start_date)
  }


  # add a 10 day buffer following the most recent value
  plot_end <- end_date + as.difftime(10, units = "days")
  buffer_dates <- seq.Date(plot_start, plot_end, by = "day")[-1]
  buffer_dates <- buffer_dates[buffer_dates <= Sys.Date()]
  buffer_j <- as.numeric(format(buffer_dates, "%j"))
  buffer <- stats::setNames(data.frame(buffer_dates, buffer_j),
                            c("Date", "J"))

  plot_data <- gw_level_dv %>%
    dplyr::right_join(buffer, by = c("Date", "J")) %>%
    dplyr::left_join(historical_stats, by = "J") %>%
    dplyr::mutate(group = "Approved Daily\nMin & Max")

  line_data <- plot_data %>%
    dplyr::select(Date, Approve, Value, middle) %>%
    tidyr::pivot_longer(-Date:-Approve) %>%
    dplyr::mutate(group = ifelse(name == "Value",
                          ifelse(Approve == "A", "Approved daily value", "Provisional daily value"),
                          historical_name)) %>%
    dplyr::select(-Approve, -name) %>%
    dplyr::filter(!is.na(value))

  line_data$group <- ordered(line_data$group,
                             levels = c("Approved daily value",
                                        "Provisional daily value",
                                        historical_name))

  # Create the plot

  line_colors <- c("limegreen",
                   "Provisional daily value" = "red",
                   "Approved daily value" = "navy")
  names(line_colors)[1] <- historical_name
  ribbon_colors <- c("Approved Daily\nMin & Max" = "lightskyblue1")

  if (month_breaks) {
    x_label <- paste(format(plot_start, "%B %Y"),
                     "to",
                     format(plot_end, "%B %Y"))
    x_breaks <- mid_month(seq.Date(plot_start, plot_end, by = "month"))
    x_tick_labels <- substr(format(x_breaks, format = "%B"), 1, 1)
  } else {
    x_label <- "Date"
    x_breaks <- seq.Date(plot_start, end_date, by = "year")
    x_tick_labels <- format(x_breaks, format = "%Y")
  }

  y_label <- y_axis_label

  plot_out <- ggplot() +
    geom_ribbon(data = plot_data,
                aes(x = Date, ymin = min, ymax = max, fill = group)) +
    geom_line(data = line_data,
              aes(x = Date, y = value, color = group)) +
    scale_color_manual(values = line_colors, name = "EXPLANATION") +
    scale_fill_manual(values = ribbon_colors, name = "") +
    hasp_framework(x_label, y_label,
                   plot_title = plot_title,
                   subtitle = subtitle) +
    theme(aspect.ratio = NULL) +
    scale_x_date(limits = c(plot_start, plot_end),
                 expand = c(0, 0),
                 breaks = x_breaks, labels = x_tick_labels) +
    guides(color = guide_legend(order = 1),
           fill = guide_legend(order = 2))

  if (month_breaks) {
    plot_out <- plot_out +
      geom_vline(xintercept = seq.Date(plot_start, plot_end, by = "month"),
                 color = "grey80") +
      theme(axis.ticks.x = element_blank())

  }

  if (flip) {
    plot_out <- plot_out +
      scale_y_continuous(trans = "reverse")
  }

  return(plot_out)

}

#' Daily frequency table
#'
#' Calculates the historical max, mean, minimum, and number of available points
#' for each day of the year
#'
#' @param gw_level_dv data frame, daily groundwater level data
#' from \code{readNWISdv}
#' @param gwl_data data frame returned from dataRetrieval::readNWISgwl, or
#' data frame with mandatory columns lev_dt (representing date), lev_age_cd (representing
#' approval code), and a column representing the measured value (either lev_va,
#' sl_lev_va, or value).
#' @param parameter_cd If data in gw_level_dv comes from NWIS, the parameter_cd
#' can be used to define the value_col.
#'  If the data doesn't come directly from NWIS services, this
#' can be set to \code{NA},and this argument will be ignored.
#' @param date_col the heading of the date column. The default is \code{NA},
#' which the code will try to get the column name automatically.
#' @param value_col name of value column. The default is \code{NA},
#' which the code will try to get the column name automatically.
#' @param stat_cd If data in gw_level_dv comes from NWIS, the stat_cd
#' can be used to help define the value_col.
#' @param approved_col name of column to get provisional/approved status.
#'
#' @return a data frame giving the max, mean, min, and number of available
#' days of data for each day of the year.
#'
#' @export
#'
#' @examples
#'
#' # site <- "263819081585801"
#' p_code_dv <- "62610"
#' statCd <- "00001"
#' # gw_level_dv <- dataRetrieval::readNWISdv(site, p_code_dv, statCd = statCd)
#' gw_level_dv <- L2701_example_data$Daily
#' daily_frequency_table(gw_level_dv,
#'                       NULL,
#'                       parameter_cd = "62610")
#'
#' gwl_data <- L2701_example_data$Discrete
#' daily_frequency_table(gw_level_dv,
#'                       gwl_data,
#'                       parameter_cd = "62610")
daily_frequency_table <- function(gw_level_dv,
                                  gwl_data,
                                  parameter_cd = NA,
                                  date_col = NA, value_col = NA,
                                  approved_col = NA, stat_cd = NA) {

  data_list <- set_up_data(gw_level_dv = gw_level_dv,
                           gwl_data = gwl_data,
                           parameter_cd = parameter_cd,
                           date_col = date_col,
                           value_col = value_col,
                           approved_col = approved_col,
                           stat_cd = stat_cd)

  gw_level_dv <- data_list$gw_level_dv
  gwl_data <- data_list$gwl_data

  gw_level_dv <- dplyr::bind_rows(gw_level_dv,
                                  gwl_data)

  historical_stats <- gw_level_dv[grepl("A", gw_level_dv$Approve), ]
  historical_stats$DOY <- as.numeric(format(historical_stats$Date, "%j"))

  historical_stats <- historical_stats %>%
    dplyr::group_by(DOY) %>%
    dplyr::summarize(max = max(Value, na.rm = TRUE),
              mean = mean(Value, na.rm = TRUE),
              min = min(Value, na.rm = TRUE),
              points = dplyr::n())
  return(historical_stats)

}

#' Summary table of daily data
#'
#' @param gw_level_dv data frame, daily groundwater level data. Often obtained
#' from \code{\link[dataRetrieval]{readNWISdv}}. Use \code{NULL} for no daily data.
#' @param gwl_data data frame returned from  \code{\link[dataRetrieval]{readNWISgwl}}, or
#' data frame with a date, value, and approval columns. Using the convention:
#' lev_dt (representing date), lev_age_cd (representing approval code), and lev_va
#' or sl_lev_va (representing value) will allow defaults to work.
#' Use \code{NULL} for no discrete data.
#' @param parameter_cd If data in gw_level_dv comes from NWIS, the parameter_cd
#' can be used to define the value_col. If the data doesn't come directly from
#' NWIS services, this can be set to \code{NA},and this argument will be ignored.
#' @param date_col the name of the date column. The default is \code{NA},
#' in which case, the code will try to get the column name automatically based on NWIS
#' naming conventions. If both gw_level_dv and gwl_data data frames
#' require custom column names, the first value of this input defines the date
#' column for gw_level_dv, and the second defines gwl_data.
#' @param value_col the name of the value column. The default is \code{NA},
#' in which case, the code will try to get the column name automatically based on NWIS
#' naming conventions. If both gw_level_dv and gwl_data data frames
#' require custom column names, the first value of this input defines the value
#' column for gw_level_dv, and the second defines gwl_data.
#' @param approved_col the name of the column to get provisional/approved status.
#' The default is \code{NA}, in which case, the code will try to get the column name
#' automatically based on NWIS naming conventions. If both gw_level_dv and
#' gwl_data data frames require custom column names, the first value of this
#' input defines the approval column for gw_level_dv, and the second defines gwl_data.#' @param stat_cd If data in gw_level_dv comes from NWIS, the stat_cd
#' can be used to help define the value_col.
#' @param stat_cd If data in gw_level_dv comes from NWIS, the stat_cd
#' can be used to help define the value_col.
#'
#' @return a summary table giving the period of record, completeness
#' and percentile values
#'
#' @export
#'
#' @examples
#'
#' # site <- "263819081585801"
#' p_code_dv <- "62610"
#' statCd <- "00001"
#' # gw_level_dv <- dataRetrieval::readNWISdv(site, p_code_dv, statCd = statCd)
#' gw_level_dv <- L2701_example_data$Daily
#' daily_gwl_summary(gw_level_dv,
#'                   gwl_data = NULL,
#'                   parameter_cd = p_code_dv)
#'
#' gwl_data <- L2701_example_data$Discrete
#' daily_gwl_summary(gw_level_dv,
#'                   gwl_data = gwl_data,
#'                   parameter_cd = p_code_dv)
daily_gwl_summary <- function(gw_level_dv,
                              gwl_data,
                              parameter_cd = NA,
                              date_col = NA,
                              value_col = NA,
                              approved_col = NA,
                              stat_cd = NA) {

  data_list <- set_up_data(gw_level_dv = gw_level_dv,
                           gwl_data = gwl_data,
                           parameter_cd = parameter_cd,
                           date_col = date_col,
                           value_col = value_col,
                           approved_col = approved_col,
                           stat_cd = stat_cd)

  gw_level_dv <- data_list$gw_level_dv
  gwl_data <- data_list$gwl_data

  gw_level_dv <- dplyr::bind_rows(gw_level_dv,
                                  gwl_data)

  gw_level_dv$gw_level <- gw_level_dv$Value
  gw_level_dv$gw_level_cd <- gw_level_dv$Approve

  gw_level_dv <- gw_level_dv[grepl("A", gw_level_dv$gw_level_cd), ]

  begin_date <- min(gw_level_dv$Date, na.rm = TRUE)
  end_date <- max(gw_level_dv$Date, na.rm = TRUE)
  days <- nrow(gw_level_dv)
  percent_complete <- round(days / length(seq.Date(begin_date, end_date, by = "day")) * 100, 0)
  lowest_level <- min(gw_level_dv$gw_level, na.rm = TRUE)
  highest_level <- max(gw_level_dv$gw_level, na.rm = TRUE)
  quant <- quantile(gw_level_dv$gw_level, type = 6,
                    probs = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.90, 0.95),
                    na.rm = TRUE)
  dv_summary <- data.frame(
    begin_date = begin_date,
    end_date = end_date,
    days = days,
    percent_complete = percent_complete,
    lowest_level = lowest_level,
    p5 = quant[1],
    p10 = quant[2],
    p25 = quant[3],
    p50 = quant[4],
    p75 = quant[5],
    p90 = quant[6],
    p95 = quant[7],
    highest_level = highest_level,
    row.names = NULL)

  return(dv_summary)

}

#' Find the first day of the month for a given date
#'
#' @param date a vector of dates
#'
#' @return the first day of the month that given dates fall in
#' @export
#'
#' @examples
#' date <- as.Date("2020-12-28")
#' first_day(date)
#'
first_day <- function(date) {

  date <- as.POSIXlt(date)

  first_day_month <- as.Date(paste(
    date$year + 1900,
    date$mon + 1,
    1,
    sep = "-"
  ))

  return(first_day_month)

}

#' Find the last day of the month for a given date
#'
#' @param date a vector of dates
#'
#' @return the last day of the month that given dates fall in
#' @export
#' @examples
#' date <- as.Date("2020-12-28")
#' last_day(date)
#' last_day("2020-02-15")
#' last_day("2019-02-15")
#' last_day(c("2020-12-28", "2020-02-15", "2019-02-15"))
last_day <- function(date) {

  date <- as.POSIXlt(date)

  year <- date$year + 1900
  month <- date$mon + 1

  is_leap <- as.numeric((year %% 4 == 0 & year %% 100 != 0) |
                          year %% 400 == 0)

  total_day <- c(31, 28,
                 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)

  last_day_month <- as.Date(paste(
    year,
    month,
    total_day[month],
    sep = "-"
  ))

  if (any(is_leap & month == 2)) {
    last_day_month[is_leap & month == 2] <- as.Date(paste(year[is_leap & month == 2],
                                                          "02-29",
                                                          sep = "-"))
  }


  return(last_day_month)

}


#' Find the middle of the month for a given date
#'
#' @param date a vector of dates
#'
#' @return the middle day of the month the given dates fall in
#' @export
#' @examples
#' date <- as.Date("2020-12-28")
#' mid_month(date)
#' mid_month(c("2019-02-15", "2020-03-08", "2010-06-01"))
mid_month <- function(date) {

  last_days <- last_day(date)
  first_days <- first_day(date)

  mid <- first_days + difftime(last_days, first_days) / 2

  return(mid)

}
USGS-R/HASP documentation built on July 28, 2024, 7:53 a.m.