macrosheds: Tools for interfacing with the MacroSheds dataset

Documented in ms_run_egret

#' Run the WRTDS model through EGRET for MacroSheds data
#'
#' ms_run_egret prepares data for the EGRET package and runs the Weighted Regression
#' on Time, Discharge, and Season (WRTDS) model.
#'
#' @keywords internal
#' @author Spencer Rhea 
#' @author Mike Vlah, \email{vlahm13@@gmail.com}
#' @author Wes Slaughter
#' @param stream_chemistry \code{data.frame} in MacroSheds format (see [MacroSheds EDI](https://portal.edirepository.org/nis/mapbrowse?scope=edi&identifier=1262)
#'     Must contain only one stream chemistry variable and one site_code to comply with WRTDS.
#'     A \code{tibble} of stream chemistry, in
#'     MacroSheds format, can be generated by [ms_load_product()].
#' @param discharge \code{data.frame} in MacroSheds format (see [MacroSheds EDI](https://portal.edirepository.org/nis/mapbrowse?scope=edi&identifier=1262)
#'     Must contain discharge for only one site_code to comply with WRTDS.
#'     A \code{tibble} of discharge, in
#'     MacroSheds format, can be generated by [ms_load_product()].
#' @param prep_data logical. Should data be thinned/altered to follow EGRET recommendations?
#'     See details for more information.
#' @param run_egret logical. If FALSE, the EGRET model will not be run and an EGRET
#'     eList will be returned. This argument is intended for those who want to change
#'     default parameters in the EGRET model but need data in the eList format.
#' @param Kalman logical. Should EGRET run the Kalman filter on WRTDS results? See
#'     [EGRET::WRTDSKalman()] for more information.
#' @param site_data tibble. Default NULL, if you are using a non-MacroSheds site you must 
#'    supply a tibble with the columns: site_code, ws_area_ha, latitude, longitude. If
#'    using a MacroSheds site you can leave this argument NULL and the MacroSheds site_data
#'    table will be retrieved.
#' @param quiet logical. Should warnings be printed to console?
#' @return returns a \code{list} in the EGRET format with model results.
#' @details
#' WRTDS is a model to predict stream chemistry at a daily time step 
#' based on the season, discharge, and time since last grab sample. This model is 
#' useful for calculating flux when sampling is sparse (greater than a weekly 
#' sampling frequency). For more information on the WRTDS and EGRET package 
#' see <https://github.com/USGS-R/EGRET>.
#'
#' \code{prep_data} will remove NAs from the dataset, set all 0 values in stream 
#' chemistry dataset to the minimum value in the dataset (this is required because EGRET will
#' fail if there are 0 values in a dataset), remove years with fewer than 6 chemistry samples,
#' and remove datetimes when there is chemistry but no discharge data reported.
#' @seealso [ms_load_product()], [ms_conversions()]

ms_run_egret <- function(stream_chemistry, discharge, prep_data = TRUE, 
                         run_egret = TRUE, kalman = FALSE, quiet = FALSE,
                         site_data = NULL){

    library("dplyr", quietly = TRUE)

    check_suggested_pkgs(c('EGRET'))

    # Checks 
    if(any(! c('date', 'site_code', 'var', 'val') %in% names(stream_chemistry))){
        stop('stream_chemistry must be in MacroSheds format (required columns: date, site_code, var, val)')
    }
    
    if(any(! c('site_code', 'var', 'val', 'date') %in% names(discharge))){
        stop('discharge must be in MacroSheds format (required columns: date, site_code, var, val)')
    }
    
    if(! length(unique(macrosheds::ms_drop_var_prefix_(stream_chemistry$var))) == 1){
        stop('Only one chemistry variable can be run at a time.')
    }
    
    if((! length(unique(stream_chemistry$site_code)) == 1) || (! length(unique(discharge$site_code)) == 1)){
        stop('Only one site can be run in EGRET at a time')
    }
    
    if(! unique(stream_chemistry$site_code) == unique(discharge$site_code)){
        stop('stream_chemistry and discharge must contain the same site_code')
    }

    requireNamespace('macrosheds', quietly = TRUE)
    
    # Get var and site info
    if(is.null(site_data)){
        site_data <- macrosheds::ms_site_data
        
        if(! unique(stream_chemistry$site_code) %in% site_data$site_code){
            stop('This site is not in the MacroSheds dataset, provide a site_data table with the names: site_code, ws_area_ha, latitude, longitude')
        }
    } else{
        if(!all(names(site_data) %in% c('site_code', 'ws_area_ha', 'latitude', 'longitude'))){
            stop('If you are not using a macrosheds site, you must supply site_data with a tibble with the names: site_code, ws_area_ha, latitude, longitude')
        }
        site_data <- site_data %>% 
            mutate(site_type = 'stream_gauge')
    }
    
    ms_vars <- macrosheds::ms_vars_ts %>% 
        filter(unit != 'kg/ha/d') %>% 
        dplyr::select(variable_code, unit) %>% 
        distinct()
    
    site_code <- unique(stream_chemistry$site_code)
    
    #### Prep Files ####
    
    if(prep_data){
        
        # EGRET does not like NAs
        stream_chemistry <- stream_chemistry %>%
            filter(!is.na(val))
        discharge <- discharge %>%
            filter(!is.na(val))
        
        # Egret can't accept 0s in the column for min val (either hack egret or do this or
        # look for detection limits)
        min_chem <- stream_chemistry %>%
            filter(val > 0) %>%
            pull(val) %>%
            min()
        
        if(!min_chem == Inf){
            stream_chemistry <- stream_chemistry %>%
                mutate(val = ifelse(val == 0, !!min_chem, val))
        }
        
        # Filter so there is only Q going into the model that also has chem
        stream_chemistry <- stream_chemistry %>%
            mutate(year = lubridate::year(datetime),
                   month = lubridate::month(datetime)) %>%
            mutate(waterYear = ifelse(month %in% c(10, 11, 12), year+1, year))
        
        # Get years with at least 6 chemistry samples (bi-monthly sampling is a 
        # reasonable requirement?)
        years_with_data <- stream_chemistry %>%
            group_by(waterYear) %>%
            summarise(n = n()) %>%
            filter(n >= 6) %>%
            pull(waterYear)
        
        # Filter Q and chem to overlap in a water-year
        chem_dates <- get_start_end(stream_chemistry)
        q_dates <- get_start_end(discharge)
        
        start_date <- if(chem_dates[1] > q_dates[1]) { chem_dates[1] } else{ q_dates[1] }
        end_date <- if(chem_dates[2] < q_dates[2]) { chem_dates[2] } else{ q_dates[2] }
        
        discharge <- discharge %>%
            filter(datetime >= !!start_date,
                   datetime <= !!end_date)
        stream_chemistry <- stream_chemistry %>%
            filter(datetime >= !!start_date,
                   datetime <= !!end_date)
        
        # Filter discharge to only include water years with chemistry sampling 
        discharge <- discharge %>%
            mutate(year = lubridate::year(datetime),
                   month = lubridate::month(datetime)) %>%
            mutate(waterYear = ifelse(month %in% c(10, 11, 12), year+1, year)) %>%
            filter(waterYear %in% !!years_with_data)
        
        # Remove times when there is a chem sample and no Q reported
        samples_to_remove <- left_join(stream_chemistry, discharge, by = 'datetime') %>%
            filter(is.na(val.y)) %>%
            pull(datetime)
        
        stream_chemistry <- stream_chemistry %>%
            filter(!datetime %in% samples_to_remove)
    }
    
    # Set up EGRET Sample file 
    Sample_file <- tibble(Name = site_code,
                          Date = as.Date(stream_chemistry$datetime),
                          ConcLow = stream_chemistry$val,
                          ConcHigh = stream_chemistry$val,
                          Uncen = 1,
                          ConcAve = stream_chemistry$val,
                          Julian = as.numeric(julian(lubridate::ymd(stream_chemistry$datetime),origin=as.Date("1850-01-01"))),
                          Month = lubridate::month(stream_chemistry$datetime),
                          Day = lubridate::yday(stream_chemistry$datetime),
                          DecYear = decimalDate(stream_chemistry$datetime),
                          MonthSeq = get_MonthSeq(stream_chemistry$datetime)) %>%
        mutate(SinDY = sin(2*pi*DecYear),
               CosDY = cos(2*pi*DecYear))  %>%
        mutate(waterYear = ifelse(Month %in% c(10, 11, 12), lubridate::year(Date) + 1, lubridate::year(Date))) %>%
        dplyr::select(Name, Date, ConcLow, ConcHigh, Uncen, ConcAve, Julian, Month, Day,
               DecYear, MonthSeq, waterYear, SinDY, CosDY)
    
    # Set up EGRET Daily file 
    Daily_file <- tibble(Name = site_code,
                         Date = as.Date(discharge$datetime),
                         Q = discharge$val/1000,
                         Julian = as.numeric(julian(lubridate::ymd(discharge$datetime),origin=as.Date("1850-01-01"))),
                         Month = lubridate::month(discharge$datetime),
                         Day = lubridate::yday(discharge$datetime),
                         DecYear = decimalDate(discharge$datetime),
                         MonthSeq = get_MonthSeq(discharge$datetime),
                         Qualifier = discharge$ms_status) 
    
    if(prep_data){
        # Egret can't handle 0 in Q, setting 0 to the minimum Q ever reported seem reasonable 
        min_flow <- min(Daily_file$Q[Daily_file$Q > 0], na.rm = TRUE)
        
        no_flow_days <- Daily_file %>%
            filter(Q == 0) %>%
            pull(Date)
        
        Daily_file <- Daily_file %>%
            mutate(Q = ifelse(Q <= 0, !!min_flow, Q))
        
    }
    
    Daily_file <- Daily_file %>%
        mutate(Q7 = zoo::rollmean(Q, 7, fill = NA, align = 'right'),
               Q30 = zoo::rollmean(Q, 30, fill = NA, align = 'right'),
               LogQ = log(Q)) 
    
    Daily_file <- tibble::rowid_to_column(Daily_file, 'i') %>%
        dplyr::select(Name, Date, Q, Julian, Month, Day, DecYear, MonthSeq, Qualifier,
               i, LogQ, Q7, Q30)

    # Set up INFO table 
    var <- macrosheds::ms_drop_var_prefix_(unique(stream_chemistry$var))
    var_unit <- ms_vars %>%
        filter(variable_code == !!var) %>%
        pull(unit)
    site_lat <- site_data %>%
        filter(site_code == !!site_code) %>%
        pull('latitude')
    site_lon <- site_data %>%
        filter(site_code == !!site_code) %>%
        pull('longitude')
    site_ws_area <- site_data %>%
        filter(site_code == !!site_code,
               site_type == 'stream_gauge') %>%
        pull('ws_area_ha') 
    site_ws_area <- site_ws_area * 100
    
    new_point <- sf::st_sfc(sf::st_point(c(site_lon, site_lat)), crs = 4326) %>%
        sf::st_transform(., crs = 4267)
    
    INFO_file <- tibble(agency_cd = 'macrosheds',
                        site_no = site_code,
                        station_nm = site_code, 
                        site_tp_code = 'ST',
                        lat_va = site_lat,
                        long_va = site_lon,
                        dec_lat_va = site_lat,
                        dec_long_va = site_lon,
                        coord_meth_cd = NA,
                        coord_acy_cd = NA,
                        coord_datum_cd = NA,
                        dec_coord_datum_cd = NA,
                        district_cd = NA,
                        state_cd = NA,
                        county_cd = NA,
                        country_cd = NA,
                        land_net_ds = NA,
                        map_nm = NA,
                        map_scale_fc = NA,
                        alt_va = NA,
                        alt_meth_cd = NA,
                        alt_acy_va = NA,
                        alt_datum_cd = NA,
                        huc_cd = NA,
                        basin_cd = NA,
                        topo_cd = NA,
                        instruments_cd = NA,
                        construction_dt = NA,
                        inventory_dt = NA,
                        drain_area_va = site_ws_area*2.59,
                        contrib_drain_area_va = NA,
                        tz_cd = 'UTC',
                        local_time_fg = 'Y',
                        reliability_cd = NA,
                        gw_file_cd = NA,
                        nat_aqfr_cd = NA,
                        aqfr_cd = NA,
                        aqfr_type_cd = NA,
                        well_depth_va = NA,
                        hole_depth_va = NA,
                        depth_src_cd = NA,
                        project_no = NA,
                        shortName = site_code,
                        drainSqKm = site_ws_area,
                        staAbbrev = site_code,
                        param.nm = var,
                        param.units = var_unit,
                        paramShortName = var,
                        paramNumber = NA,
                        constitAbbrev = var,
                        paStart = 10,
                        paLong = 12)
    
    eList <- EGRET::mergeReport(INFO_file, Daily_file, Sample_file,
                                verbose = !quiet)
    
    if(! run_egret){
        return(eList)
    }
    
    eList <- try(EGRET::modelEstimation(eList, verbose = !quiet))
    
    if(inherits(eList, 'try-error')){
        stop('EGRET failed while running WRTDS. See https://github.com/USGS-R/EGRET for reasons data may not be compatible with the WRTDS model.')
    }
    
    if(kalman){
        eList <- EGRET::WRTDSKalman(eList, verbose = !quiet)
        
        if(prep_data){
            # Set flux and Q to 0 and conc to NA on no flow days
            eList$Daily <- eList$Daily %>%
                mutate(Q = ifelse(Date %in% !!no_flow_days, 0, Q),
                       LogQ = ifelse(Date %in% !!no_flow_days, 0, LogQ),
                       Q7 = ifelse(Date %in% !!no_flow_days, 0, Q7),
                       Q30 = ifelse(Date %in% !!no_flow_days, 0, Q30),
                       ConcDay = ifelse(Date %in% !!no_flow_days, NA, ConcDay),
                       FluxDay = ifelse(Date %in% !!no_flow_days, 0, FluxDay),
                       FNConc = ifelse(Date %in% !!no_flow_days, NA, FNConc),
                       FNFlux = ifelse(Date %in% !!no_flow_days, 0, FNFlux),
                       GenFlux = ifelse(Date %in% !!no_flow_days, 0, GenFlux),
                       GenConc = ifelse(Date %in% !!no_flow_days, 0, GenConc))
        }
    } else if(prep_data){
            # Set flux and Q to 0 and conc to NA on no flow days
            eList$Daily <- eList$Daily %>%
                mutate(Q = ifelse(Date %in% !!no_flow_days, 0, Q),
                       LogQ = ifelse(Date %in% !!no_flow_days, 0, LogQ),
                       Q7 = ifelse(Date %in% !!no_flow_days, 0, Q7),
                       Q30 = ifelse(Date %in% !!no_flow_days, 0, Q30),
                       ConcDay = ifelse(Date %in% !!no_flow_days, NA, ConcDay),
                       FluxDay = ifelse(Date %in% !!no_flow_days, 0, FluxDay),
                       FNConc = ifelse(Date %in% !!no_flow_days, NA, FNConc),
                       FNFlux = ifelse(Date %in% !!no_flow_days, 0, FNFlux))
    }
    return(eList)
}