macrosheds: Tools for interfacing with the MacroSheds dataset

Documented in ms_calc_vwc

#' Calculates chemical fluxes
#'
#' Calculates stream discharge and precipitation fluxes of chemicals from Q (discharge
#' or precipitation) and chemistry data.
#'
#' @keywords internal
#' @author Spencer Rhea 
#' @author Mike Vlah, \email{vlahm13@@gmail.com}
#' @author Wes Slaughter
#' @param chemistry \code{data.frame}. A \code{data.frame} of precipitation or
#'    stream chemistry data in MacroSheds format and in units of mg/L.
#' @param q \code{data.frame}. A \code{data.frame} of precipitation or stream
#'    discharge in MacroSheds format and in units of mm or L/s, respectively.
#' @param q_type character. Either 'precipitation' or 'discharge'.
#' @param verbose logical. Default TRUE; prints more information to console.
#' @return returns a \code{tibble} of stream or precipitation chemical flux for every timestep
#'    where discharge/precipitation and chemistry are reported. Output units are kg/ha/timestep.
#' @details
#' Chemical Volume Weighted Concentration (VWC) is 'traditionally' calculated as
#' VWC = sum(concentrations * volumes) / sum(volumes) over a given period. Because we have discharge
#' (a rate), rather than volume, we could scale it to the sample interval and then use the equation
#' above. For simplicity, we use this instead, which works out the same:
#' VWC = mean(concentrations * discharges) / mean(discharges)
#' Note also that we're starting from flux here, for computational efficiency, so the "concentrations * volumes"
#' part is already computed, and it's a rate too. That means the modified equation is:
#' VWC = mean(fluxes) / mean(discharges). There are two minor introductions of
#' error. The first is that we don't omit discharge values from the denominator that lack a corresponding
#' flux value (this is expensive, and not doing it should rarely skew the results). The second is that a "month"
#' is considered 30 days here, which only affects filtering by coverage.
#' @seealso [ms_calc_flux], [ms_synchronize_timestep()], [ms_conversions()], [ms_scale_flux_by_area()], [ms_undo_scale_flux_by_area()]
#' @examples
#' #' ### Load some MacroSheds data:
#' ms_root = 'data/macrosheds'
#' ms_download_core_data(macrosheds_root = ms_root,
#'                       domains = 'hbef')
#'
#' chemistry <- ms_load_product(macrosheds_root = ms_root,
#'                              prodname = 'stream_chemistry',
#'                              site_codes = c('w1', 'w3', 'w6'),
#'                              filter_vars = c('NO3_N', 'Cl', 'Na'))
#'
#' q <- ms_load_product(macrosheds_root = ms_root,
#'                      prodname = 'discharge',
#'                      site_codes = c('w1', 'w3', 'w6'))
#'
#' flux <- ms_calc_vwc(chemistry = chemistry,
#'                      q = q,
#'                      q_type = 'discharge')

ms_calc_vwc <- function(chemistry, q, q_type, agg = "yearly", verbose = TRUE) {

    library("dplyr", quietly = TRUE)

    #### Checks
    if(! all(c('site_code', 'val', 'var', 'date') %in% names(chemistry))){
        stop('The argument to chemistry must contain precipitation chemistry or stream chemistry data in MacroSheds format (required columns: date, site_code, var, val).')
    }
    if(! all(c('site_code', 'val', 'var', 'date') %in% names(q))){
        stop('The argument to q must contain precipitation or stream discharge data in MacroSheds format (required columns: date, site_code, var, val).')
    }
    if(! grepl('(precipitation|discharge)', q_type)){
        stop('q_type must be "discharge" or "precipitation"')
    }
    if(! 'Date' %in% class(q$date)){
        q$date <- as.POSIXct(q$date)
    }
    if(! 'Date' %in% class(chemistry$date)){
        chemistry$date <- as.POSIXct(chemistry$date)
    }

    requireNamespace('macrosheds', quietly = TRUE)

    # Check the intervals are the same in both chemistry and q
    q_interval <- Mode(diff(as.numeric(q$datetime)))

    interval <- case_when(q_interval == 86400 ~ 'daily',
                          q_interval == 3600 ~ 'hourly',
                          q_interval == 1800 ~ '30 minute',
                          q_interval == 960 ~ '15 minute',
                          q_interval == 600 ~ '10 minute',
                          q_interval == 300 ~ '5 minute',
                          q_interval == 60 ~ '1 minute')

    # q_interval <- errors::as.errors(q_interval)


    flow_is_highres <- Mode(diff(as.numeric(q$datetime))) <= 15 * 60
    if(is.na(flow_is_highres)) { flow_is_highres <- FALSE }

    if(is.na(interval)) {
        stop(paste0('interval of samples must be one',
                    ' of: daily, hourly, 30 minute, 15 minute, 10 minute, 5 minute, or 1 minute.',
                    ' See macrosheds::ms_synchronize_timestep() to standardize your intervals.'))
    } else if(verbose) {
        print(paste0('q dataset has a ', interval, ' interval'))
    }

    ## add errors if they don't exist
    #if('val_err' %in% names(chemistry)){
    #    errors::errors(chemistry$val) <- chemistry$val_err

    #    chemistry <- chemistry %>%
    #        dplyr::select(-val_err)

    #} else if(all(errors::errors(chemistry$val) == 0)){
    #    errors::errors(chemistry$val) <- 0
    #}

    #if('val_err' %in% names(q)){
    #    errors::errors(q$val) <- q$val_err

    #    q <- q %>%
    #        dplyr::select(-val_err)

    #} else if(all(errors::errors(q$val) == 0)){
    #    errors::errors(q$val) <- 0
    #}
	chemistry <- dplyr::select(chemistry, -any_of('val_err'))
	q <- dplyr::select(q, -any_of('val_err'))

    # calc VWC
    sites <- unique(chemistry$site_code)

    all_sites_vwc <- tibble()
    for(s in 1:length(sites)) {

        site <- sites[s]

        site_chem <- chemistry %>%
            filter(site_code == !!site)

        site_q <- q %>%
            filter(site_code == !!site)

        daterange <- range(site_chem$datetime)

        site_q <- site_q %>%
            filter(
                site_code == !!site,
                datetime >= !!daterange[1],
                datetime <= !!daterange[2])

        if(nrow(site_q) == 0) { return(NULL) }

        chem_split <- site_chem %>%
            group_by(var) %>%
            arrange(datetime) %>%
            dplyr::group_split() %>%
            as.list()

        # Loop though all variables
        for(i in 1:length(chem_split)) {

            chem_chunk <- chem_split[[i]]

            chem_is_highres <- Mode(diff(as.numeric(chem_chunk$datetime))) <= 15 * 60
            if(is.na(chem_is_highres)) { chem_is_highres <- FALSE}

            #if both chem and flow data are low resolution (grab samples),
            #   let approxjoin_datetime match up samples with a 12-hour gap. otherwise the
            #   gap should be 7.5 mins so that there isn't enormous duplication of
            #   timestamps where multiple high-res values can be snapped to the
            #   same low-res value
            if(! chem_is_highres && ! flow_is_highres) {
                join_distance <- c('12:00:00')#, '%H:%M:%S')
            } else {
                join_distance <- c('7:30')#, '%M:%S')
            }

            chem_split[[i]] <- approxjoin_datetime(x = chem_chunk,
                                                   y = site_q,
                                                   rollmax = join_distance,
                                                   keep_datetimes_from = 'x')
          if(agg == "monthly") {
            agg_step <- c("year", "month")
            threshold <- 3
          } else if(agg == "yearly") {
            agg_step <- c("year")
            threshold <- 12
          } else {
            stop("user input aggregation type not recognized. accepted VWC aggregation time frames are 'monthly' and yearly'")
          }

          chem_split[[i]] <- chem_split[[i]] %>%
            filter_at(vars(val_x, val_y),
                      all_vars(!is.na(.))) %>%
            mutate(month = lubridate::month(datetime),
                   year = lubridate::year(datetime)) %>%
            group_by_at(agg_step) %>%
            summarize(site_code = site_code_x,
                      var = var_x,
                      val = mean(val_x * val_y)/mean(val_y),
                      n = n(),
                      ms_status = numeric_any_v(ms_status_x, ms_status_y),
                      ms_interp = numeric_any_v(ms_interp_x, ms_interp_y)) %>%
            dplyr::select(-starts_with(c('site_code_', 'var_', 'val_',
                                  'ms_status_', 'ms_interp_'))) %>%
            distinct() %>%
            arrange_at(agg_step)

          sub_pars <- sum(chem_split[[i]]$n < threshold)

          if(sub_pars > 0) {
            warning(paste(sub_pars, agg, "VWCs calculated with less than", threshold, "contributing samples"))
          }
        }

        vwc <- chem_split %>%
            purrr::reduce(bind_rows) %>%
            arrange_at(c("site_code", "var", agg_step))

        all_sites_vwc <- rbind(all_sites_vwc, vwc)

    }

    if(nrow(all_sites_vwc) == 0) { return(NULL) }

    #all_sites_vwc$val_err <- errors::errors(all_sites_vwc$val)
    #all_sites_vwc$val <- errors::drop_errors(all_sites_vwc$val)

    return(all_sites_vwc)
}