R/ATTAINSCrosswalks.R

Defines functions TADA_CreateParamRef TADA_UpdateMonitoringLocationsInATTAINS TADA_GetATTAINSAUSiteCrosswalk

Documented in TADA_CreateParamRef TADA_GetATTAINSAUSiteCrosswalk TADA_UpdateMonitoringLocationsInATTAINS

#' Get Monitoring Location Identifier and Assessment Unit Identifier
#' Crosswalk from ATTAINS
#'
#' Tribes and States who participate in electronic reporting of water quality
#' conditions through EPA ATTAINS may optionally submit a crosswalk of WQP
#' monitoring location identifiers associated with their assessment units to
#' ATTAINS. If an organization has recorded this information in ATTAINS,
#' this function can be used to get the ATTAINS user submitted crosswalk of
#' known monitoring location identifiers and assessment units. As of 2025, all
#' tribal nations record this information in ATTAINS but only a few states.
#'
#' @param org_id The ATTAINS organization identifier must be supplied by the
#' user. A list of organization identifiers can be found by downloading the
#' ATTAINS Domains Excel file:
#' https://www.epa.gov/system/files/other-files/2025-02/domains_2025-02-25.xlsx.
#' Organization identifiers are listed in the "OrgName" tab. The "code" column
#' contains the organization identifiers that should be used for this param.
#'
#' @return A dataframe with four columns, MonitoringLocationIdentifier,
#' OrganizationIdentifier, ATTAINS.assessmentunitidentifier, and
#' MonitoringDataLinkText is returned. This is the crosswalk between monitoring
#' location identifiers and assessment units that the state or tribal
#' organization submitted to ATTAINS (optional). If an ATTAINS organization
#' has not submitted this information in ATTAINS, the function will not return
#' a dataframe.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Alaska example
#' AK_crosswalk <- TADA_GetATTAINSAUSiteCrosswalk(org_id = "AKDECWQ")
#'
#' # Pueblo of Tesuque example
#' PUEBLOOFTESUQUE_crosswalk <- TADA_GetATTAINSAUSiteCrosswalk(
#'   org_id = "PUEBLOOFTESUQUE"
#' )
#'
#' # Arizona example, returns blank dataframe as of 1/21/25
#' AZ_crosswalk <- TADA_GetATTAINSAUSiteCrosswalk(org_id = "21ARIZ")
#' }
#'
TADA_GetATTAINSAUSiteCrosswalk <- function(org_id = NULL) {
  org.ref <- TADA_GetATTAINSOrgIDsRef()

  if (!org_id %in% org.ref$code) {
    print(paste0(
      "TADA_GetATTAINSAUSiteCrosswalk: ",
      "The organization identifier entered by user is not found in ATTAINS."
    ))
  }

  if (org_id %in% org.ref$code) {
    rm(org.ref)

    au.info <- rATTAINS::assessment_units(organization_id = org_id)

    au.crosswalk <- au.info %>%
      tidyr::unnest(monitoring_stations) %>%
      dplyr::select(
        monitoring_location_identifier, monitoring_organization_identifier,
        assessment_unit_identifier, monitoring_data_link_text
      ) %>%
      dplyr::filter(!is.na(monitoring_location_identifier)) %>%
      dplyr::distinct() %>%
      dplyr::rename(
        ATTAINS.assessmentunitidentifier = assessment_unit_identifier,
        MonitoringLocationIdentifier = monitoring_location_identifier,
        OrganizationIdentifier = monitoring_organization_identifier,
        MonitoringDataLinkText = monitoring_data_link_text
      ) %>%
      # paste org_id in front of MLs from the specified org if they are missing
      # from ATTAINS
      dplyr::mutate(MonitoringLocationIdentifier = ifelse((
        OrganizationIdentifier == org_id &
          stringr::str_detect(MonitoringLocationIdentifier,
            org_id,
            negate = TRUE
          )),
      paste0(org_id, "-", MonitoringLocationIdentifier),
      MonitoringLocationIdentifier
      ))

    rm(au.info)

    if (length(au.crosswalk$MonitoringLocationIdentifier > 0)) {
      print(paste0(
        "TADA_GetATTAINSAUSiteCrosswalk: ",
        "There are ", nrow(au.crosswalk),
        " monitoring location identifiers associated with assessment units for ",
        org_id, " in ATTAINS."
      ))

      return(au.crosswalk)
    }

    if (length(au.crosswalk$MonitoringLocationIdentifier) == 0) {
      print(paste0(
        "TADA_GetATTAINSAUSiteCrosswalk: ",
        "No monitoring location identifiers were recorded in ATTAINS for ",
        org_id, " assessment units.", " No crosswalk can be returned."
      ))

      rm(au.crosswalk)
    }
  }
}



#' Create or Update Monitoring Location Identifier and Assessment Unit Identifier
#' Crosswalk in ATTAINS
#'
#' This function creates the batch upload files needed to create or update
#' Monitoring Location Identifiers in ATTAINS Assessment Unit profiles. Users
#' can specify whether all records should be overwritten (replaced) or if new
#' Monitoring Location Identifiers should be appended (added) to existing
#' records.
#'
#' ATTAINS batch upload files are available here:
#' https://www.epa.gov/waterdata/upload-data-resources-registered-attains-users#batch-upload-templates
#' See Assessment Unit Batch Upload Template.
#'
#' @param org_id Character argument. The ATTAINS organization identifier must
#' be supplied by the user. A list of organization identifiers can be found by
#' downloading the ATTAINS Domains Excel file:
#' https://www.epa.gov/system/files/other-files/2025-02/domains_2025-02-25.xlsx.
#' Organization identifiers are listed in the "OrgName" tab. The "code" column
#' contains the organization identifiers that should be used for this param.
#'
#' @param wqp_data_links Character argument. When wqp_data_links is equal to
#' "add" or "replace", the function will build the URL for the Water Quality
#' Portal Data Site page for each Monitoring Location Identifier in the data
#' frame. It will examine the response code of each URL and only retain those
#' with a 200 response, which indicates the URL is valid. When
#' wqp_data_links = "add", the URL will be added to any existing text in the
#' MS_DATA_LINK_TEXT column. When wqp_data_links = "replace", the URL will
#' replace any existing text in the MS_DATA_LINK_TEXT column. When
#' wqp_data_links = "none", no URLs will be created or added to the returned
#' data frame. Default is wqp_data_links = "add".
#'
#' @param attains_replace Character argument. When attains_replace = FALSE, all
#' Monitoring Location Identifiers in the user supplied crosswalk will be
#' appended to the existing ATTAINS crosswalk. When attains_replace = TRUE,
#' Monitoring Location Identifiers will only be retained if they are in the
#' user supplied crosswalk. Default equals FALSE.
#'
#' @param crosswalk A user-supplied dataframe with the columns
#' ASSESSMENT_UNIT_ID, MS_LOCATION_ID, MS_ORG_ID, and MONITORING_DATA_LINK_TEXT
#' is required. The ASSESSMENT_UNIT_ID and MS_LOCATION_ID must be filled out
#' in order to use this function. The additional columns,
#' MONITORING_DATA_LINK_TEXT, containing a single URL or "; " separated URLs
#' linking to information about the monitoring location, and MS_ORG_ID,
#' containing the WQP organization identifier for the monitoring location can
#' be left blank and the function will still run. Data link URLS to WQP site
#' pages cannot be automatically generated by this function unless the
#' MS_ORG_ID column is populated with the WQP OrganizationIdentifier. When
#' crosswalk = NULL, the crosswalk will be downloaded from ATTAINS. This allows
#' users to add URLs for the Water Quality Portal data site pages to the ATTAINS
#' assessment unit profile where possible without updating other information
#' in ATTAINS.
#'
#' @return A dataframe with four columns, MonitoringLocationIdentifier,
#' OrganizationIdentifier, ATTAINS.assessmentunitidentifier, and
#' MonitoringDataLinkText is returned. This matches the format of the batch
#' upload files required to add or update monitoring locations in ATTAINS.
#'
#' @seealso [TADA_GetATTAINSAUSiteCrosswalk()]
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Alaska example to updated data links with no user supplied crosswalk
#' AK_adddatalinks <- TADA_UpdateMonitoringLocationsInATTAINS(
#'   org_id = "AKDECWQ",
#'   crosswalk = NULL,
#'   attains_replace = FALSE,
#'   wqp_data_links = "replace"
#' )
#'
#' # Alaska example using a user supplied crosswalk to update entries in
#' # ATTAINS by appending user supplied information to ATTAINS crosswalk
#'
#' # example monitoring location identifiers
#' ASSESSMENT_UNIT_ID <- c(
#'   "AK_M_1021211_000", "AK_M_1021008_000",
#'   "AK_M_1021109_013", "AK_M_1021109_013",
#'   "AK_M_1021109_013"
#' )
#'
#' # example organization identifiers
#' MS_ORG_ID <- c("AKDECWQ", "AKDECWQ", "AKDECWQ", "AKDECWQ", "AKDECWQ")
#'
#' # example assessment units
#' MS_LOCATION_ID <- c(
#'   "ExampleSite1", "ExampleSite2", "ExampleSite3",
#'   "ExampleSite4", "ExampleSite5"
#' )
#' # example urls
#' MONITORING_DATA_LINK_TEXT <- c(
#'   "https://www.waterqualitydata.us/provider/STORET/AKDECWQ/",
#'   "https://www.waterqualitydata.us/provider/STORET/AKDECWQ/",
#'   "https://www.waterqualitydata.us/provider/STORET/AKDECWQ/",
#'   "https://www.waterqualitydata.us/provider/STORET/AKDECWQ/",
#'   "https://www.waterqualitydata.us/provider/STORET/AKDECWQ/"
#' )
#'
#' # create example crosswalk data frame
#' ex.user.cw <- data.frame(
#'   MS_LOCATION_ID, MS_ORG_ID, ASSESSMENT_UNIT_ID,
#'   MONITORING_DATA_LINK_TEXT
#' )
#'
#' AK_appenduserdata <- TADA_UpdateMonitoringLocationsInATTAINS(
#'   org_id = "AKDECWQ",
#'   crosswalk = ex.user.cw,
#'   attains_replace = FALSE,
#'   wqp_data_links = "none"
#' )
#' }
#'
TADA_UpdateMonitoringLocationsInATTAINS <- function(org_id = NULL,
                                                    crosswalk = NULL,
                                                    attains_replace = FALSE,
                                                    wqp_data_links = "add") {
  # get list of organization identifiers from ATTAINS
  org.ref <- utils::read.csv(system.file("extdata", "ATTAINSOrgIDsRef.csv",
    package = "EPATADA"
  ))

  # stop function if organization identifiers is not found in ATTAINS
  if (!org_id %in% org.ref$code) {
    stop(paste0(
      "TADA_UpdateMonitoringLocationsInATTAINS: ",
      "The organization identifier entered by user is not found in ATTAINS."
    ))
  }

  if (is.null(crosswalk) & attains_replace == TRUE) {
    stop(paste0(
      "TADA_UpdateMonitoringLocationsInATTAINS: ",
      "in order to replace MonitoringLocations stored in ATTAINS ",
      "(with attains_replace = TRUE), user must provide a ",
      "MonitoringLocation/AssessmentUnitcrosswalk."
    ))
  }

  if (org_id %in% org.ref$code) {
    # remove intermediate object
    rm(org.ref)

    # if the crosswalk is not a dataframe, stop the function
    if (!is.data.frame(crosswalk) & !is.null(crosswalk)) {
      stop(paste0(
        "TADA_UpdateMonitoringLocationsInATTAINS: ",
        "A crosswalk dataframe with columns 'ATTAINS.assessmentunit.identifier' and ",
        "'MonitoringLocationIdentifier' or setting crosswalk = NULL is required to run ",
        "this function."
      ))
    }

    # check that crosswalk is a dataframe before proceeding
    if (is.data.frame(crosswalk)) {
      # check crosswalk has all of the required columns
      expected_cols <- c(
        "ASSESSMENT_UNIT_ID",
        "MS_LOCATION_ID",
        "MS_ORG_ID",
        "MONITORING_DATA_LINK_TEXT"
      )

      TADA_CheckColumns(crosswalk, expected_cols)
    }

    if (attains_replace == FALSE) {
      # create assessment unit crosswalk from ATTAINS
      attains.crosswalk <- TADA_GetATTAINSAUSiteCrosswalk(org_id = org_id) %>%
        dplyr::rename(
          ASSESSMENT_UNIT_ID = ATTAINS.assessmentunitidentifier,
          MS_ORG_ID = OrganizationIdentifier,
          MS_LOCATION_ID = MonitoringLocationIdentifier,
          MONITORING_DATA_LINK_TEXT = MonitoringDataLinkText
        )

      if (is.null(crosswalk)) {
        update.crosswalk <- attains.crosswalk


        rm(attains.crosswalk)
      }

      if (!is.null(crosswalk)) {
        # combine user supplied and attains crosswalks to create one crosswalk
        # no rows are omitted
        update.crosswalk <- attains.crosswalk %>%
          dplyr::full_join(crosswalk, by = dplyr::join_by(
            MS_LOCATION_ID,
            MS_ORG_ID,
            ASSESSMENT_UNIT_ID,
            MONITORING_DATA_LINK_TEXT
          )) %>%
          dplyr::distinct()

        rm(attains.crosswalk, crosswalk)
      }
    }

    # when replace is true, only rows in user-supplied crosswalk are used

    if (attains_replace == TRUE) {
      update.crosswalk <- crosswalk

      rm(attains.crosswalk, crosswalk)
    }

    # add Monitoring Location data links if wqp_data_links is not equal to "none"

    if (wqp_data_links != "none") {
      # get org/provider name ref

      provider.ref <- TADA_GetWQPOrgProviderRef() %>%
        dplyr::select(OrganizationIdentifier, ProviderName) %>%
        dplyr::distinct() %>%
        dplyr::rename(MS_ORG_ID = OrganizationIdentifier) %>%
        dplyr::mutate(OrgIDForURL = MS_ORG_ID)

      # add additional rows to account for the addition of "_WQX" to many org
      # names for WQP data
      add.orgs <- provider.ref %>%
        dplyr::filter(grepl("_WQX", MS_ORG_ID)) %>%
        dplyr::mutate(MS_ORG_ID = stringr::str_remove_all(
          OrgIDForURL,
          "_WQX"
        ))

      # combine provider refs
      provider.ref <- provider.ref %>%
        dplyr::bind_rows(add.orgs)

      # remove intermediate object
      rm(add.orgs)

      # join provider ref df to crosswalk
      update.crosswalk <- update.crosswalk %>%
        dplyr::left_join(provider.ref, by = dplyr::join_by(MS_ORG_ID))

      # next build the URLS for ms location urls
      update.crosswalk <- update.crosswalk %>%
        dplyr::mutate(MONITORING_DATA_LINK_TEXT.New = ifelse(
          is.na(OrgIDForURL), NA,
          URLencode(paste0(
            "https://www.waterqualitydata.us/provider/", ProviderName,
            "/", OrgIDForURL, "/", MS_LOCATION_ID, "/"
          ))
        ))

      # create df of urls to check
      urls.to.check <- update.crosswalk %>%
        dplyr::filter(!is.na(MONITORING_DATA_LINK_TEXT.New))

      # retrieve http response headers from url list
      headers <- urls.to.check$MONITORING_DATA_LINK_TEXT.New %>%
        purrr::map(~ tryCatch(curlGetHeaders(.x), error = function(e) NA))

      # extract response code from first line of header response
      response.code <- sapply(headers, "[[", 1)

      # create dataframe of urls and response codes
      response.df <- data.frame(urls.to.check, response.code) %>%
        dplyr::distinct()

      # join response codes to add.urls df
      update.crosswalk <- update.crosswalk %>%
        dplyr::left_join(response.df, by = names(update.crosswalk))
    }

    if (wqp_data_links == "replace") {
      update.crosswalk <- update.crosswalk %>%
        dplyr::mutate(MONITORING_DATA_LINK_TEXT = ifelse(
          grepl("200", response.code), MONITORING_DATA_LINK_TEXT.New,
          MONITORING_DATA_LINK_TEXT
        )) %>%
        dplyr::select(
          ASSESSMENT_UNIT_ID, MS_ORG_ID,
          MS_LOCATION_ID, MONITORING_DATA_LINK_TEXT
        ) %>%
        dplyr::distinct()
    }

    if (wqp_data_links == "add") {
      update.crosswalk <- update.crosswalk %>%
        dplyr::mutate(
          MONITORING_DATA_LINK_TEXT = ifelse(
            grepl("200", response.code),
            paste0(
              MONITORING_DATA_LINK_TEXT, "; ",
              MONITORING_DATA_LINK_TEXT.New
            ),
            MONITORING_DATA_LINK_TEXT
          ),
          MONITORING_DATA_LINK_TEXT = stringr::str_remove_all(
            MONITORING_DATA_LINK_TEXT,
            "NA, "
          )
        ) %>%
        tidyr::separate_rows(MONITORING_DATA_LINK_TEXT, sep = ", ") %>%
        dplyr::group_by(ASSESSMENT_UNIT_ID, MS_ORG_ID, MS_LOCATION_ID) %>%
        suppressMessages(dplyr::summarise(
          MONITORING_DATA_LINK_TEXT =
            paste(
              unique(
                MONITORING_DATA_LINK_TEXT
              ),
              collapse = ", "
            )
        )) %>%
        dplyr::select(
          ASSESSMENT_UNIT_ID,
          MS_ORG_ID, MS_LOCATION_ID,
          MONITORING_DATA_LINK_TEXT
        ) %>%
        dplyr::distinct()
    }
    return(update.crosswalk)
  }
}



#' Create or Update ATTAINS, TADA/WQP/WQX, and EPA Criteria Search Tool (CST)
#' Parameter Name Crosswalk
#'
#' Use this function to help generate a crosswalk between each
#' ATTAINS.ParameterName used by a specific state or tribal nation and each
#' TADA.ComparableDataIdentifier present in the input TADA dataframe. The
#' crosswalk can be filled out by users within R or Excel. By default this
#' function will generate a user friendly Excel spreadsheet that includes a
#' drop down list list of all ATTAINS parameters that are applicable to the
#' organization selected by the function input 'org_id'. It also
#' highlights the cells in which users should input information. The excel
#' spreadsheet will be automatically downloaded to a user's downloads folder path.
#' Users may need to insert additional rows into the crosswalk if:
#' 1) an ATTAINS.ParameterName corresponds with multiple TADA.ComparableDataIdentifiers
#'    Example: An organization uses "ALUMINUM" for all aluminum related parameter causes
#'    but this ATTAINS.ParameterName may crosswalk to "ALUMINUM_TOTAL_NA_UG/L"
#'    for one use and "ALUMINUM_DISSOLVED_NA_UG/L" for another use; or
#' 2) an TADA.ComparableDataIdentifiers corresponds with multiple ATTAINS.ParameterNames.
#'    Example: An organization uses both "pH, HIGH" and "pH, LOW" as ATTAINS.ParameterNames,
#'    but both crosswalk to the same TADA.ComparableDataIdentifier, "PH_NA_NA_STD UNITS".
#'
#' Users who have already created an ATTAINS parameter and TADA/WQP characteristic
#' crosswalk can provide it as an input to this function. The user-supplied
#' crosswalk (dataframe entered into paramRef function input) must contain the
#' two required columns: TADA.ComparableDataIdentifier and ATTAINS.ParameterName.
#' In addition, users who are interested in performing analyses for more than
#' one organization (multiple states or tribes, or a single state/tribe and
#' EPA 304a criteria) also need to include an additional column name:
#' 'organization_identifier'. This ensures that the crosswalk between
#' TADA.ComparableDataIdentifier and ATTAINS.ParameterName are specific and
#' accurate for each organization. If a crosswalk has already been created in the
#' past and is entered into this function as a starting point, then any
#' TADA.ComparableDataIdentifiers that were previously matched
#' with ATTAINS parameters will be retained in the crosswalk, and any new
#' TADA.ComparableDataIdentifiers from the new input data frame will be added
#' to the crosswalk. Users can then focus on matching only the new
#' TADA.ComparableDataIdentifiers with applicable ATTAINS parameter names.
#'
#' The EPA TADA team created a draft crosswalk between characteristic
#' names (TADA.ComparableDataIdentifier) and EPA 304A pollutant names
#' (sourced from the Criteria Search Tool:
#' https://www.epa.gov/wqs-tech/state-specific-water-quality-standards-effective-under-clean-water-act-cwa)
#' This crosswalk only includes priority characteristics identified by the TADA
#' Working Group. You are welcome to reach out to the TADA team to ask for
#' additional matches to be included. You may run the following line of code
#' in the console to review this crosswalk:
#' 'CSTtoATTAINSParamCrosswalk <- utils::read.csv(system.file("extdata", "TADAPriorityCharUnitRef.csv", package = "EPATADA"))'.
#'
#' If no existing ATTAINS parameter name corresponds with a specific
#' TADA.ComparableDataIdentifier, users may contact the ATTAINS helpdesk
#' \email{attains@epa.gov} to inquire about adding the parameter. Users are
#' free to use any ATTAINS parameter name found in the ATTAINS parameter domain
#' value list, even if the parameter name  has not previously
#' been listed as a cause by the specific organization in the
#' past. The full list of ATTAINS parameter names can be found by downloading
#' the ATTAINS Domains Excel file:
#' https://www.epa.gov/system/files/other-files/2025-02/domains_2025-02-25.xlsx.
#' In the meantime, users can proceed by overriding the data validation in Excel
#' by value pasting. In that case, users will be warned in the
#' ATTAINS.FlagParameterName column that they choose to include an
#' ATTAINS.ParameterName that was not used by the selected organization in prior
#' ATTAINS assessment cycles.
#'
#' @param .data A TADA dataframe. We recommend running all desired data
#' cleaning, processing, harmonization, filtering, QAQC, and handling of
#' censored data prior to running TADA_CreateParamRef.
#'
#' @param org_id The ATTAINS organization identifier must be supplied by the
#' user. A list of organization identifiers can be found by downloading
#' the ATTAINS Domains Excel file:
#' https://www.epa.gov/system/files/other-files/2025-02/domains_2025-02-25.xlsx.
#' Organization identifiers are listed in the "OrgName" tab.
#' The "code" column contains the organization identifiers that
#' should be used for this parameter. If a user supplied crosswalk is entered
#' into paramRef AND a user does not provide an org_id argument,
#' the function can identify which organization identifier(s) to include
#' based on the unique ATTAINS organization identifiers found in the dataframe.
#'
#' @param excel A Boolean value that returns an excel spreadsheet if
#' excel = TRUE. This spreadsheet is created in the user's downloads folder path.
#' If needed, please type the following into your R console:
#' file.path(Sys.getenv("USERPROFILE"), "Downloads") to ensure the file is downloaded
#' to the correct location. The file will be named "myfileRef.xlsx".
#'
#' @param overwrite A Boolean value that ensures the function will not overwrite
#' the user supplied crosswalk entered into this function via the paramRef
#' function input. This helps prevent users from overwriting their progress.
#'
#' @param paramRef A dataframe which contains a completed crosswalk between
#' TADA_ComparableDataIdentifier and ATTAINS.ParameterName. Users will need to
#' ensure this crosswalk contains the appropriate column names in order to
#' run the function. paramRef must contain at least these two column names:
#' TADA.ComparableDataIdentifier and ATTAINS.ParameterName. Users who are
#' interested in performing analyses for more than
#' one organization (multiple states or tribes, or a single state/tribe and
#' EPA 304a criteria) also need to include an additional column name:
#' 'organization_identifier'.
#'
#' @return A excel file or data frame which contains the columns:
#' TADA.ComparableDataIdentifier, organization_identifier, EPA304A.PollutantName,
#' ATTAINS.ParameterName, and ATTAINS.FlagParameterName. Users will need to
#' complete the crosswalk between ATTAINS.ParameterName and
#' TADA.ComparableDataIdentifier.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # This creates a blank paramRef template of UT Nutrients data.
#' # Users will need to fill this template out.
#' # Uncomment example below to generate Excel file
#' # (we recommended working on this in Excel):
#' # TADA_CreateParamRef(Data_Nutrients_UT, org_id = "UTAHDWQ", excel = TRUE)
#' # Example below generates the same output as a dataframe
#' paramRef_UT <- TADA_CreateParamRef(
#'   Data_Nutrients_UT,
#'   org_id = "UTAHDWQ", excel = FALSE
#' )
#' # Users can choose to edit the paramRef_UT through the R environment or in
#' # the excel spreadsheet. Users should be aware that any updates done only
#' # in the R environment will not reflect the 'ATTAINS.FlagParameterName' values
#' # correctly. If completed in R, we recommend users rerun this function
#' # to update the 'ATTAINS.FlagParameterName'.
#' # See below for a simple example of this workflow:
#'
#' # Manually add ATTAINS parameters to crosswalk using R
#' paramRef_UT2 <- dplyr::mutate(paramRef_UT,
#'   ATTAINS.ParameterName = dplyr::case_when(
#'     grepl("AMMONIA", TADA.ComparableDataIdentifier) ~ "AMMONIA, TOTAL",
#'     grepl("NITRATE", TADA.ComparableDataIdentifier) ~ "NITRATE",
#'     grepl("NITROGEN", TADA.ComparableDataIdentifier) ~
#'       "NITRATE/NITRITE (NITRITE + NITRATE AS N)"
#'   )
#' )
#' # Update the 'ATTAINS.FlagParameterName' values
#' paramRef_UT3 <- TADA_CreateParamRef(Data_Nutrients_UT,
#'   paramRef = paramRef_UT2,
#'   org_id = "UTAHDWQ", excel = FALSE
#' )
#'
#' # Example where multiple org_id's are selected
#' # First, run key flag functions and harmonize synonyms across
#' # characteristic, fraction, and speciation columns
#' Data_NCTCShepherdstown <-
#'   TADA_RunKeyFlagFunctions(Data_NCTCShepherdstown_HUC12)
#' Data_NCTCShepherdstown2 <- TADA_HarmonizeSynonyms(Data_NCTCShepherdstown)
#' # Create ATTAINS parameter crosswalk for MD, VA, and PA
#' paramRef_NCTC <- TADA_CreateParamRef(Data_NCTCShepherdstown2,
#'   org_id =
#'     c("MDE_EASP", "21VASWCB", "21PA"), excel = FALSE
#' )
#' }
#'
TADA_CreateParamRef <- function(.data, org_id = NULL, paramRef = NULL, excel = TRUE,
                                overwrite = FALSE) {
  # check to see if user-supplied parameter ref is a df with appropriate columns
  if (!is.null(paramRef) & !is.character(paramRef)) {
    if (!is.data.frame(paramRef)) {
      stop(paste0(
        "TADA_CreateParamRef: 'paramRef' must be a data frame with these 2 columns:",
        "TADA.ComparableDataIdentifier and ATTAINS.ParameterName"
      ))
    }

    if (is.data.frame(paramRef)) {
      col.names <- c(
        "TADA.ComparableDataIdentifier", "ATTAINS.ParameterName"
      )

      ref.names <- names(paramRef)

      # Users are required to provide a parameter ref that contains
      # TADA.ComparableDataIdentifier and ATTAINS.ParameterName
      if (length(setdiff(col.names, ref.names)) > 0 &&
        !("TADA.ComparableDataIdentifier" %in% names(paramRef))) {
        stop(paste0(
          "TADA_CreateParamRef: 'paramRef' must be a data frame with these 2 columns:",
          "TADA.ComparableDataIdentifier and ATTAINS.ParameterName"
        ))
      }
    }
  }

  .data <- as.data.frame(.data)

  # If users don't provide TADA.ComparableDataIdentifier in their paramRef input,
  # crosswalk using TADA.CharacteristicName, TADA.MethodSpeciationName, TADA.ResultSampleFractionText
  if (!is.null(paramRef) & !("TADA.ComparableDataIdentifier" %in% names(paramRef))) {
    paramRef <- paramRef %>%
      dplyr::left_join(
        .data, c(
          "TADA.CharacteristicName", "TADA.MethodSpeciationName",
          "TADA.ResultSampleFractionText"
        )
      ) %>%
      dplyr::select(
        "TADA.CharacteristicName", "TADA.ComparableDataIdentifier", "organization_identifier",
        "EPA304A.PollutantName", "ATTAINS.ParameterName", "ATTAINS.FlagParameterName"
      )
  }

  # if a user provides an org_id argument, it must be a character vector.
  if (!is.character(org_id) & !is.null(org_id)) {
    stop("TADA.CreateParamRef: org_id must be a character vector or left as NULL")
  }

  # if user doesn't provide an org_id argument, the function extracts the unique
  # org_id from TADA_GetATTAINS(). Users will need to have ran TADA_GetATTAINS()
  # for this option to be allowed. Selection of org_id will filter the ATTAINS
  # domain list for parameter and use name by org_id.
  if (is.null(org_id)) {
    TADA_CheckColumns(.data, "ATTAINS.organizationname")
    print(paste0(
      "TADA.CreateParamRef: No organization identifier(s) provided.",
      "Attempting to pull in organization identifiers found in the TADA data frame.",
      "Users are required to run TADA_GetATTAINS if an org_id argument input is not provided."
    ))
    org_id <- unique(stats::na.omit(.data[, "ATTAINS.organizationname"]))
  }

  org_id <- as.list(org_id)

  # If  more than 1 org, it will create n duplicate rows for each TADA.ComparableDataIdentifier.
  if (length(org_id) > 1) {
    print(paste0(
      "TADA.CreateParamRef: More than one org_name was defined in your dataframe.",
      "Generating duplicate rows of TADA.ComparableDataIdentifier for each org."
    ))
  }

  # overwrite argument should only be used when creating an excel file.
  if (excel == FALSE && overwrite == TRUE) {
    stop(paste0(
      "argument input excel = FALSE and overwrite = TRUE is an invalid combination."
    ))
  }

  # 304a parameter name and standards are pulled in from the Criteria Search Tool (CST)
  CST_param <- utils::read.csv(system.file("extdata", "CST.csv", package = "EPATADA"))

  # Pulls in all unique combinations of TADA.ComparableDataIdentifier in user's dataframe.
  TADA_param <- dplyr::distinct(
    .data[, c("TADA.CharacteristicName", "TADA.ComparableDataIdentifier")]
  ) %>%
    dplyr::left_join(CST_param, "TADA.CharacteristicName") %>%
    dplyr::select(
      TADA.CharacteristicName, TADA.ComparableDataIdentifier,
      EPA304A.PollutantName = POLLUTANT_NAME
    ) %>%
    dplyr::arrange(TADA.CharacteristicName, TADA.ComparableDataIdentifier) %>%
    tidyr::uncount(weights = length(org_id)) %>%
    dplyr::mutate(organization_identifier = rep(org_id, nrow(.) / length(org_id)))

  TADA_param$organization_identifier <- as.character(TADA_param$organization_identifier)

  # Pulls in all domain values of parameter names in ATTAINS.
  ATTAINS_param_all <- utils::read.csv(
    system.file("extdata", "ATTAINSParamUseEntityRef.csv", package = "EPATADA")
  )

  ATTAINS_param <- ATTAINS_param_all %>%
    dplyr::filter(organization_identifier %in% org_id) %>%
    dplyr::arrange(parameter)

  # Should we stop or warn users in this step?
  if (sum(!org_id %in% ATTAINS_param_all$organization_identifier) > 0) {
    warning(paste0(
      "TADA_CreateParamRef: ",
      "One or more organization identifiers entered by user is not found in ATTAINS."
    ))
  }

  # Print message if there are many combinations of TADA Characteristic as it may slow run time.
  n <- nrow(dplyr::distinct(.data[, c("TADA.CharacteristicName", "TADA.ComparableDataIdentifier")]))
  if (n > 100 & excel == TRUE) {
    message(paste0("There are ", n, " unique TADA.ComparableDataIdentifier names in your TADA data frame.
    This may result in slow runtime for TADA_CreateParamRef() when generating the excel spreadsheet."))
  }

  # If no paramRef is provided, the ATTAINS.ParameterName returns a blank column of NA that will need user input.
  if (is.null(paramRef)) {
    CreateParamRef <- TADA_param %>%
      dplyr::mutate(ATTAINS.ParameterName = NA) %>%
      dplyr::select(
        TADA.ComparableDataIdentifier, organization_identifier,
        EPA304A.PollutantName, ATTAINS.ParameterName
      ) %>%
      dplyr::arrange(organization_identifier) %>%
      dplyr::distinct()
  }

  # If a user does provide a paramRef, this joins the dataframe to match the final output.
  # rename() is used as we may need to consider if we want to consider allowing users
  # to crosswalk CST parameter names or modify EPA 304A pollutant names on their ends.
  if (!is.null(paramRef)) {
    paramRef$organization_identifier <- as.character(paramRef$organization_identifier)

    CreateParamRef <- TADA_param %>%
      dplyr::left_join(
        paramRef,
        c(
          "TADA.ComparableDataIdentifier", "organization_identifier"
        ),
        relationship = "many-to-many"
      ) %>%
      dplyr::rename(
        dplyr::any_of(
          c(
            EPA304A.PollutantName = "EPA304A.PollutantName.x",
            organization_identifier = "organization_identifier.x",
            ATTAINS.ParameterName = "ATTAINS.ParameterName.y"
          )
        )
      ) %>%
      dplyr::select(
        TADA.ComparableDataIdentifier, organization_identifier,
        EPA304A.PollutantName, ATTAINS.ParameterName
      ) %>%
      dplyr::filter(organization_identifier %in% org_id) %>%
      dplyr::arrange(organization_identifier) %>%
      dplyr::distinct()
  }

  # Re-runs the flagging data after a user has inputted values.
  # Will need to be done if a user only inputs values in the R environment and not in excel.
  if (is.null(paramRef)) {
    CreateParamRef <- CreateParamRef %>%
      dplyr::mutate(
        ATTAINS.ParameterName = NA,
        ATTAINS.FlagParameterName = "No parameter crosswalk provided. Parameter will not be used for assessment"
      )
  }

  if (!is.null(paramRef)) {
    Flag1 <- paramRef %>%
      dplyr::anti_join(
        ATTAINS_param_all,
        by = c("ATTAINS.ParameterName" = "parameter", "organization_identifier")
      ) %>%
      dplyr::select(ATTAINS.ParameterName, organization_identifier) %>%
      dplyr::distinct() %>%
      dplyr::mutate(ATTAINS.FlagParameterName1 = dplyr::case_when(
        ATTAINS.ParameterName == "No parameter match for TADA.ComparableDataIdentifier" | is.na(ATTAINS.ParameterName) ~
          "No parameter crosswalk provided for TADA.ComparableDataIdentifier. Parameter will not be used for assessment",
        !ATTAINS.ParameterName %in% ATTAINS_param_all$parameter ~
          "Parameter name is not included in ATTAINS, contact ATTAINS to add parameter name to Domain List",
        !ATTAINS.ParameterName %in% ATTAINS_param_all[ATTAINS_param_all$organization_identifier == organization_identifier, "parameter"] ~
          "Parameter name is listed as a prior cause in ATTAINS, but not for this organization"
      ))

    CreateParamRef <- CreateParamRef %>%
      dplyr::left_join(Flag1, c("ATTAINS.ParameterName", "organization_identifier")) %>%
      dplyr::mutate(ATTAINS.FlagParameterName = dplyr::case_when(
        is.na(ATTAINS.ParameterName) ~
          "No parameter crosswalk provided for TADA.ComparableDataIdentifier. Parameter will not be used for assessment",
        !is.na(ATTAINS.FlagParameterName1) ~
          ATTAINS.FlagParameterName1,
        is.na(ATTAINS.FlagParameterName1) ~
          "Parameter name is listed as a prior cause in ATTAINS for this organization"
      )) %>%
      dplyr::select(
        TADA.ComparableDataIdentifier, organization_identifier,
        EPA304A.PollutantName, ATTAINS.ParameterName, ATTAINS.FlagParameterName
      ) %>%
      dplyr::distinct()

    # remove intermediate object Flag1
    rm(Flag1)
  }

  # Excel ref files to be stored in the Downloads folder location.
  downloads_path <- file.path(Sys.getenv("USERPROFILE"), "Downloads", "myfileRef.xlsx")

  if (excel == TRUE) {
    # Create column names for an empty dataframe
    columns <- c(
      "TADA.ComparableDataIdentifier", "EPA304A.PollutantName",
      "ATTAINS.ParameterName", "organization_identifier", "ATTAINS.FlagParameterName"
    )

    par <- data.frame(matrix(nrow = 0, ncol = length(columns))) # empty dataframe with just column names
    colnames(par) <- columns

    wb <- openxlsx::createWorkbook()
    openxlsx::addWorksheet(wb, "ATTAINSOrgNamesParamRef", visible = TRUE)
    openxlsx::addWorksheet(wb, "CreateParamRef", visible = TRUE)
    openxlsx::addWorksheet(wb, "Index", visible = FALSE)

    # set zoom size
    set_zoom <- function(x) gsub('(?<=zoomScale=")[0-9]+', x, sV, perl = TRUE)
    sV <- wb$worksheets[[2]]$sheetViews
    wb$worksheets[[2]]$sheetViews <- set_zoom(90)
    # Format header and bodystyle
    header_st <- openxlsx::createStyle(textDecoration = "Bold")
    bodyStyle <- openxlsx::createStyle(wrapText = TRUE)

    # Index of allowable values for drop-down lists
    openxlsx::writeData(
      wb, "Index",
      startCol = 4,
      x = rbind(
        ATTAINS_param_all,
        c(rep("NA", 3), "No use name match for TADA.ComparableDataIdentifier", "No parameter match for TADA.ComparableDataIdentifier")
      )
    )

    openxlsx::writeData(
      wb, "Index",
      startCol = 3,
      x = data.frame(
        organization_identifier = c(unique(ATTAINS_param$organization_identifier))
      )
    )

    openxlsx::writeData(
      wb, "Index",
      startCol = 2,
      x = unique(CST_param$CST.PollutantName)
    )

    openxlsx::writeData(
      wb, "Index",
      startCol = 1,
      x = data.frame(
        ATTAINS.ParameterName = c(unique(ATTAINS_param$parameter), "No parameter match for TADA.ComparableDataIdentifier")
      )
    )

    # Format column widths in CreateParamRef - for future considerations of formatting
    openxlsx::setColWidths(wb, "CreateParamRef", cols = 1:ncol(par), widths = "auto")

    openxlsx::writeData(wb, "CreateParamRef", startCol = 1, x = CreateParamRef, headerStyle = header_st)

    # Creates a tab that contains the ATTAINS parameter-use filtered by the org_id input.
    openxlsx::writeData(wb, "ATTAINSOrgNamesParamRef", startCol = 1, x = ATTAINS_param, headerStyle = header_st)

    # The list of allowable values for each column in excel tab [CreateParamRef] will be defined by the [Index] tab

    # Note: If we make edits to the data validation, please ensure the entire
    # data frame column is being referenced.
    # Ex. data validation will capture values in tab [Index] column h, for rows 2:50000 for input, value = sprintf("'Index'!$H$2:$H$50000")
    suppressWarnings(
      openxlsx::dataValidation(
        wb,
        sheet = "CreateParamRef",
        cols = 2, rows = 2:1000,
        type = "list",
        value = sprintf("'Index'!$C$2:$C$5000"),
        allowBlank = TRUE, showErrorMsg = TRUE, showInputMsg = TRUE
      )
    )

    suppressWarnings(
      openxlsx::dataValidation(
        wb,
        sheet = "CreateParamRef",
        cols = 3, rows = 2:1000,
        type = "list",
        value = sprintf("'Index'!$B$2:$B$5000"),
        allowBlank = TRUE, showErrorMsg = TRUE, showInputMsg = TRUE
      )
    )

    suppressWarnings(
      openxlsx::dataValidation(
        wb,
        sheet = "CreateParamRef",
        cols = 4, rows = 2:1000,
        type = "list",
        value = sprintf("'Index'!$H$2:$H$50000"),
        allowBlank = TRUE, showErrorMsg = TRUE, showInputMsg = TRUE
      )
    )

    # remove intermediate objects
    rm(CST_param, ATTAINS_param, ATTAINS_param_all)

    for (i in 1:nrow(CreateParamRef)) {
      openxlsx::writeFormula(
        wb, "CreateParamRef",
        startCol = 5,
        startRow = i + 1,
        array = TRUE,
        x = paste0(
          "=IF(OR(D", i + 1, '="",D', i + 1,
          '="No parameter match for TADA.ComparableDataIdentifier"),"No parameter crosswalk provided for TADA.ComparableDataIdentifier. Parameter will not be used for assessment",
          IF(ISNA(MATCH(D', i + 1, ',Index!H:H,0)),
            "Parameter name is not included in ATTAINS, contact ATTAINS to add parameter name to Domain List",
          IF(ISNA(MATCH(1,(D', i + 1, "=ATTAINSOrgNamesParamRef!E:E)*(B", i + 1, '=ATTAINSOrgNamesParamRef!A:A),0)),
            "Parameter name is listed as a prior cause in ATTAINS, but not for this organization",
            "Parameter name is listed as a prior cause in ATTAINS for this organization")))'
        )
      )

      openxlsx::conditionalFormatting(
        wb, "CreateParamRef",
        cols = 4,
        rows = i + 1,
        type = "blanks",
        style = openxlsx::createStyle(bgFill = TADA_ColorPalette()[13])
      )

      openxlsx::conditionalFormatting(
        wb, "CreateParamRef",
        cols = 4,
        rows = i + 1,
        type = "notBlanks",
        style = openxlsx::createStyle(bgFill = TADA_ColorPalette()[8])
      )
    }

    # remove intermediate object TADA_param
    rm(TADA_param)

    if (overwrite == TRUE) {
      message(
        paste0("Overwriting sheet [CreateParamRef] in ", downloads_path)
      )
      openxlsx::saveWorkbook(wb, downloads_path, overwrite = T)
    }

    if (overwrite == FALSE) {
      message(
        "If you would like to replace sheet [CreateParamRef], use overwrite = TRUE argument in TADA_CreateParamRef"
      )
      openxlsx::saveWorkbook(wb, downloads_path, overwrite = F)
    }

    cat("File saved to:", gsub("/", "\\\\", downloads_path), "\n")
  }

  return(CreateParamRef)
}



#' Create or Update ATTAINS Parameter and Use crosswalk
#'
#' This function generates a crosswalk of all parameters and uses applicable
#' to the selected organization(s) in ATTAINS. Users should review and validate
#' each ATTAINS.ParameterName and associated use_name combination.
#' As part of this review process, users should check to make sure each
#' 'use_name' from the drop-down menu in the excel spreadsheet generated by this
#' function also accurately corresponds to the correct TADA.ComparableDataIdentifier
#' and ATTAINS.ParameterName found in the TADA dataframe. This function should be
#' run after creating a parameter (ATTAINS.ParameterName and
#' TADA.ComparableDataIdentifier) crosswalk.
#'
#' Before running this function, users must run TADA_CreateParamRef() to
#' create the crosswalk that defines the ATTAINS.ParameterName(s) and
#' use_name(s) needing validation. All unique use_names from prior ATTAINS
#' assessment cycles are pulled in using TADA_CreateUseParamRef(). If a user has
#' defined multiple TADA.ComparableDataIdentifier matches to an
#' ATTAINS.ParameterName, they will need to define whether every
#' TADA.ComparableDataIdentifier matches to an associated use_name.
#' If certain parameter and use combinations only apply to certain
#' TADA.ComparableDataIdentifier(s), users will need to select 'NA' or leave it
#' as blank to properly capture this logic.
#'
#' If an ATTAINS use name is not listed as a prior domain value for your organization
#' from prior ATTAINS assessment cycles, users can contact the ATTAINS helpdesk
#' \email{attains@epa.gov} to inquire about adding the use to the ATTAINS
#' domain list. Otherwise, users can still
#' proceed by overriding the data validation by value pasting in Excel.
#' Users will be warned in the ATTAINS.FlagUseName column if they choose to
#' include an ATTAINS use name that was not listed in prior ATTAINS assessment cycles as:
#' 'Use name is not listed as a prior cause in ATTAINS for this organization' or
#' 'Use name is listed as a prior cause in this organization, but not for this parameter name'.
#'
#' Users will have the flexibility to include the EPA304a criteria by including
#' the string 'EPA304a' in the org_id function argument.Users who only want to
#' review data against the EPA304a criteria can enter: org_id = "EPA304a".
#'
#' Users who want both their organization and and the EPA304a criteria
#' can input a vector such as: org_id = c("EPA304a", "UTAHDWQ").
#'
#' NOTE: The EPA304a criteria are not a part of ATTAINS. This information is
#' brought in from EPA's Criteria Search Tool (CST):
#' www.epa.gov/wqs-tech/state-specific-water-quality-standards-effective-under-clean-water-act-cwa.
#' The TADA Team has crosswalked the CST pollutant names with
#' TADA.ComparableDataIdentifier(s) to make the criteria values available for
#' use within TADA functions. The use_name(s) associated with the EPA304a
#' criteria are included from the CST. All other use_name(s) are specific to an
#' ATTAINS organization and come from the ATTAINS domain value for use_name.
#'
#' @param .data A TADA dataframe. The user should run all desired data cleaning,
#' processing, harmonization, filtering, and handling of censored data functions
#' prior to running TADA_CreateParamRef.
#'
#' @param org_id The ATTAINS organization identifier must be supplied by the
#' user. A list of organization identifiers can be found by downloading
#' the ATTAINS Domains Excel file:
#' https://www.epa.gov/system/files/other-files/2025-02/domains_2025-02-25.xlsx.
#' organization identifiers are listed in the "OrgName" tab.
#' The "code" column contains the organization identifiers that
#' should be used for this param. If a user does not provide an org_id argument,
#' the function attempts to identify which organization identifier(s) to include
#' based on the unique ATTAINS organization identifiers found in the dataframe.
#'
#' @param excel A Boolean value that returns an excel spreadsheet if
#' excel = TRUE. This spreadsheet is created in the user's downloads folder path.
#' If you have any trouble locating the file, please type the following into
#' your R console to locate it: file.path(Sys.getenv("USERPROFILE"), "Downloads").
#' The file will be named "myfileRef.xlsx". The excel spreadsheet will highlight
#' the cells in which users should input information. Users may need to insert
#' additional rows if:
#' 1) ATTAINS.ParameterName(s) correspond with multiple TADA.ComparableDataIdentifier(s)
#'    Example: An org uses "ALUMINUM" for all aluminum related parameter causes,
#'    but this ATTAINS.parameter name may crosswalk to "ALUMINUM_TOTAL_NA_UG/L"
#'    for one designated use and "ALUMINUM_DISSOLVED_NA_UG/L" for another; or
#' 2) TADA.ComparableDataIdentifier(s) are matched with multiple
#'    ATTAINS.ParameterNames. Example: An org uses both "pH, HIGH" and "pH, LOW"
#'    as ATTAINS.ParameterNames, both crosswalk to the
#'    TADA.ComparableDataIdentifier "PH_NA_NA_STD UNITS".
#'
#' @param overwrite A Boolean value that ensures the function will not overwrite
#' the user supplied crosswalk entered into this function via the paramRef
#' function input. This helps prevent users from overwriting their progress.
#'
#' @param paramRef A dataframe which contains a completed crosswalk between
#' TADA_ComparableDataIdentifier and ATTAINS.ParameterName. Users will need to
#' ensure this crosswalk contains the appropriate column names in order to
#' run the function. paramRef must contain at least these two column names:
#' TADA.ComparableDataIdentifier and ATTAINS.ParameterName. Users who are
#' interested in performing analyses for more than
#' one organization (multiple states or tribes, or a single state/tribe and
#' EPA 304a criteria) also need to include an additional column name:
#' 'organization_identifier'.
#'
#' @param paramUseRef A dataframe which contains a completed crosswalk of
#' organization specific use_name(s) for each ATTAINS.ParameterName.
#' Users will need to ensure this crosswalk contains the appropriate column
#' names in order to  run the function. Users who have previously completed
#' this crosswalk table can re-use it and review this output for accuracy.
#'
#' @return A dataframe which contains the columns: TADA.ComparableDataIdentifier,
#' organization_identifier, EPA304A.PollutantName, ATTAINS.ParameterName,
#' and ATTAINS.FlagUseName. Users will need to review the crosswalk between
#' ATTAINS.ParameterName, use_name and TADA.ComparableDataIdentifier.
#'
#' @seealso [TADA_CreateParamRef()]
#' @seealso [TADA_GetEPACSTRef()]
#'
#' @export
#'
#' @examples
#' # First, generate and fill out a parameter crosswalk (see TADA_CreateParamRef()):
#' paramRef_UT <- TADA_CreateParamRef(Data_Nutrients_UT, org_id = "UTAHDWQ", excel = FALSE)
#' paramRef_UT2 <- dplyr::mutate(paramRef_UT, ATTAINS.ParameterName = dplyr::case_when(
#'   grepl("AMMONIA", TADA.ComparableDataIdentifier) ~ "AMMONIA, TOTAL",
#'   grepl("NITRATE", TADA.ComparableDataIdentifier) ~ "NITRATE",
#'   grepl("NITROGEN", TADA.ComparableDataIdentifier) ~ "NITRATE/NITRITE (NITRITE + NITRATE AS N)"
#' ))
#' paramRef_UT3 <- TADA_CreateParamRef(
#'   Data_Nutrients_UT,
#'   paramRef = paramRef_UT2, org_id = "UTAHDWQ", excel = FALSE
#' )
#'
#' # Next, enter the crosswalk generated above as the paramRef function input
#' # for TADA_CreateUseParamRef():
#' paramUseRef_UT <- TADA_CreateUseParamRef(
#'   Data_Nutrients_UT,
#'   paramRef = paramRef_UT3, org_id = c("UTAHDWQ"), excel = FALSE
#' )
#'
#' # Users can include the EPA304a criteria by itself or in addition to their org(s)
#' paramUseRef_UT2 <- TADA_CreateUseParamRef(
#'   Data_Nutrients_UT,
#'   paramRef = paramRef_UT3,
#'   org_id = c("EPA304a", "UTAHDWQ"), excel = FALSE
#' )
#'
#' paramUseRef_UT3 <- TADA_CreateUseParamRef(
#'   Data_Nutrients_UT,
#'   paramRef = paramRef_UT3,
#'   org_id = c("EPA304a"), excel = FALSE
#' )
#'
TADA_CreateUseParamRef <- function(.data, org_id = NULL, paramRef = NULL,
                                   paramUseRef = NULL, excel = FALSE, overwrite = FALSE) {
  # overwrite argument should only be used when creating an excel file.
  if (excel == FALSE && overwrite == TRUE) {
    stop(paste0(
      "argument input excel = FALSE and overwrite = TRUE is an invalid combination.",
      "Cannot overwrite the excel generated spreadsheet if a user specifies excel = FALSE"
    ))
  }

  # Checks if paramRef argument contains a dataframe and necessary columns to proceed.
  if (is.null(paramRef)) {
    stop(paste0(
      "TADA.CreateUseParamRef: No paramRef argument provided."
    ))
  }

  # If org_id argument is not provided, this will attempt to pull in org_id from TADA_GetATTAINS.
  if (is.null(org_id)) {
    print(paste0(
      "TADA.CreateUseParamRef: No organization identifier(s) provided.",
      "Attempting to pull in organization identifiers found in the TADA data frame.",
      "Please ensure that you have ran TADA_GetATTAINS if you did not provide an org_id argument input."
    ))
    TADA_CheckColumns(.data, "ATTAINS.organizationname")
    org_id <- unique(stats::na.omit(.data[, "ATTAINS.organizationname"]))
  }

  # If a user does not fill in ANY values for the crosswalk of ATTAINS.ParameterName.
  # Users may want to proceed with only the EPA304a criteria crosswalk,
  # therefore we will allow users to proceed in this case.
  if (sum(!is.na(paramRef$ATTAINS.ParameterName)) == 0) {
    warning(paste0(
      "No values were found in ATTAINS.ParameterName.",
      "Please ensure that you have inputted all field values of interest in the",
      "ATTAINS.ParameterName column generated from TADA_CreateParamRef() function"
    ))
  }

  # If a user leaves at least one values for the crosswalk of ATTAINS.ParameterName blank.
  # Users are recommended to select 'No parameter match for this TADA.ComparableDataIdentifier' if
  # there is no crosswalk, but leaving it blank will be treated similarly.
  if (sum(is.na(paramRef$ATTAINS.ParameterName)) > 1) {
    print(paste0(
      "NAs were found in ATTAINS.ParameterName.",
      "Please ensure that you have inputted all field values of interest in",
      "the ATTAINS.ParameterName column generated from TADA_CreateParamRef() function"
    ))
  }

  # NOTE: For TADA Development: Do we need this? Should users be allowed to modify the EPA304a crosswalk we did?
  # For the time being I would allow this as an option, but will comment this section out for further discussions.
  # if (sum(is.na(paramRef$EPA304A.PollutantName)) > 1 && org_id == "EPA304a") {
  #   print("NAs were found in EPA304A.PollutantName. Please ensure that you have inputted all field values of interest in the EPA304A.PollutantName column generated from TADA_CreateParamRef() function if you are interested in using the 304a recommended standards")
  # }

  # check to see if user-supplied parameter ref is a df with appropriate columns and is filled out.
  if (!is.null(paramRef) & !is.character(paramRef)) {
    if (!is.data.frame(paramRef)) {
      stop(paste0(
        "TADA_CreateUseParamRef: 'paramuseRef' must be a data frame with these 2 columns:",
        "TADA.ComparableDataIdentifier and ATTAINS.ParameterName"
      ))
    }

    if (is.data.frame(paramRef)) {
      col.names <- c(
        "TADA.ComparableDataIdentifier", "ATTAINS.ParameterName"
      )

      ref.names <- names(paramRef)

      if (length(setdiff(col.names, ref.names)) > 0 && !("TADA.ComparableDataIdentifier" %in% names(paramRef))) {
        stop(paste0(
          "TADA_CreateUseParamRef: 'paramuseRef' must be a data frame with these 2 columns:",
          "TADA.ComparableDataIdentifier and ATTAINS.ParameterName"
        ))
      }
    }
  }

  # check to see if user-supplied parameter-use ref is a df with appropriate columns and is filled out.
  if (!is.null(paramRef) & !is.character(paramRef)) {
    if (!is.data.frame(paramRef)) {
      stop(paste0(
        "TADA_CreateUseParamRef: 'paramUseRef' must be a data frame with these 6 columns:",
        "TADA.ComparableDataIdentifier, organization_identifier, EPA304A.PollutantName,",
        "ATTAINS.ParameterName, use_name, IncludeOrExclude"
      ))
    }

    if (is.data.frame(paramUseRef)) {
      col.names <- c(
        "TADA.ComparableDataIdentifier", "organization_identifier",
        "EPA304A.PollutantName", "ATTAINS.ParameterName", "use_name", "IncludeOrExclude"
      )

      ref.names <- names(paramUseRef)

      if (length(setdiff(col.names, ref.names)) > 0 && !("TADA.ComparableDataIdentifier" %in% names(paramUseRef))) {
        stop(paste0(
          "TADA_CreateUseParamRef: 'paramUseRef' must be a data frame with these 6 columns:",
          "TADA.ComparableDataIdentifier, organization_identifier, EPA304A.PollutantName,",
          "ATTAINS.ParameterName, use_name, IncludeOrExclude"
        ))
      }
    }
  }

  .data <- as.data.frame(.data)

  # Pulls in all domain values of parameter and use names by orgs in ATTAINS.
  ATTAINS_param_all <- utils::read.csv(system.file("extdata", "ATTAINSParamUseEntityRef.csv", package = "EPATADA"))

  if (!is.null(paramRef) & !("TADA.ComparableDataIdentifier" %in% names(paramRef))) {
    paramRef <- paramRef %>%
      dplyr::left_join(
        .data,
        by = c(
          "TADA.CharacteristicName", "TADA.MethodSpeciationName",
          "TADA.ResultSampleFractionText"
        )
      ) %>%
      dplyr::select(
        "TADA.CharacteristicName", "TADA.ComparableDataIdentifier", "organization_identifier",
        "EPA304A.PollutantName", "ATTAINS.ParameterName", "ATTAINS.FlagParameterName"
      )
  }

  # if user doesn't provide an org_id argument, the function extracts the unique org_id from TADA_GetATTAINS().
  # Users will need to have ran TADA_GetATTAINS() for this option to be allowed.
  # Selection of org_id will filter the drop down lists in future steps of creating the reference tables.
  if (is.null(org_id)) {
    stop("TADA.CreateParamUseRef: No organization identifier(s) provided.")
  }

  org_id <- as.list(org_id)

  # Checks if org_id are valid names found in ATTAINS - with the exception of "EPA304a" as that is not an ATTAINS org_id.
  if (sum(!org_id[org_id != "EPA304a"] %in% ATTAINS_param_all$organization_identifier) > 0) {
    warning(paste0(
      "TADA_CreateParamRef: ",
      "One or more organization identifiers entered by user is not found in ATTAINS. "
    ))
  }

  # Checks if org_id are found in the user supplied paramRef argument.
  if (sum(!org_id[org_id != "EPA304a"] %in% paramRef$organization_identifier) > 0) {
    warning(paste0(
      "TADA_CreateParamRef: ",
      "One or more organization identifiers entered by user is not found in your paramRef argument input.",
      "Excluding those missing organization identifier(s) from output"
    ))
  }

  # Filters the ATTAINS parameter and use names by the org_id in user supplied df.
  ATTAINS_param <- ATTAINS_param_all %>%
    dplyr::select(organization_identifier, parameter, use_name) %>%
    dplyr::filter(parameter %in% paramRef$ATTAINS.ParameterName) %>%
    dplyr::filter(organization_identifier %in% org_id)

  # Create the parameter-use reference table for validation
  CreateParamUseRef <- paramRef %>%
    dplyr::ungroup() %>%
    dplyr::left_join(
      ATTAINS_param,
      by = c("ATTAINS.ParameterName" = "parameter", "organization_identifier"),
      relationship = "many-to-many"
    ) %>%
    dplyr::select(
      TADA.ComparableDataIdentifier, organization_identifier,
      EPA304A.PollutantName, ATTAINS.ParameterName, use_name
    ) %>%
    tidyr::drop_na(ATTAINS.ParameterName) %>%
    dplyr::filter(ATTAINS.ParameterName != "No parameter match for TADA.ComparableDataIdentifier") %>%
    dplyr::distinct()

  # If users want the EPA304a criteria. This pulls in the CST reference file.
  # Extracts the associated EPA304a pollutant names and its use_names.
  if ("EPA304a" %in% org_id) {
    CST_param <- utils::read.csv(system.file("extdata", "CST.csv", package = "EPATADA")) %>%
      dplyr::select(EPA304A.PollutantName = POLLUTANT_NAME, use_name) %>%
      dplyr::mutate(organization_identifier = "EPA304a")

    EPA_param <- CreateParamUseRef %>%
      dplyr::left_join(CST_param, c("EPA304A.PollutantName"), relationship = "many-to-many") %>%
      dplyr::select(
        TADA.ComparableDataIdentifier,
        organization_identifier = organization_identifier.y,
        ATTAINS.ParameterName, EPA304A.PollutantName, use_name = use_name.y
      ) %>%
      dplyr::distinct()

    # remove intermediate object CST_param
    rm(CST_param)

    CreateParamUseRef <- CreateParamUseRef %>%
      dplyr::ungroup() %>%
      dplyr::full_join(
        EPA_param,
        by = c(
          "TADA.ComparableDataIdentifier", "ATTAINS.ParameterName",
          "organization_identifier", "EPA304A.PollutantName", "use_name"
        )
      ) %>%
      dplyr::select(
        TADA.ComparableDataIdentifier, organization_identifier,
        EPA304A.PollutantName, ATTAINS.ParameterName, use_name
      ) %>%
      dplyr::filter(organization_identifier %in% org_id)

    # remove intermediate object EPA_param
    rm(EPA_param)
  }

  # If a user provide a paramUseRef argument, this will
  if (!is.null(paramUseRef)) {
    CreateParamUseRef <- paramUseRef %>%
      dplyr::left_join(
        CreateParamUseRef,
        by = c(
          "TADA.ComparableDataIdentifier", "ATTAINS.ParameterName",
          "organization_identifier", "EPA304A.PollutantName", "use_name"
        )
      ) %>%
      dplyr::arrange(use_name) %>%
      dplyr::select(
        TADA.ComparableDataIdentifier, organization_identifier,
        EPA304A.PollutantName, ATTAINS.ParameterName, use_name
      ) %>%
      dplyr::filter(organization_identifier %in% org_id)
  }

  # This updates the flagging column. Users who only creates an R dataframe in the R environment will need to
  # ensure they re-run the function with their completed paramRef as an input to reflect this column accurately.
  Flag1 <- CreateParamUseRef %>%
    dplyr::anti_join(
      ATTAINS_param_all,
      by = c("use_name", "organization_identifier")
    ) %>%
    dplyr::select(use_name, organization_identifier) %>%
    dplyr::distinct() %>%
    dplyr::mutate(
      ATTAINS.FlagUseName1 = "Use name is listed as a prior cause in this organization, but not for this parameter name"
    )

  Flag2 <- CreateParamUseRef %>%
    dplyr::anti_join(
      ATTAINS_param_all,
      by = c("ATTAINS.ParameterName" = "parameter", "use_name", "organization_identifier")
    ) %>%
    dplyr::select(use_name, ATTAINS.ParameterName, organization_identifier) %>%
    dplyr::distinct() %>%
    dplyr::mutate(
      ATTAINS.FlagUseName2 = "Use name is not listed as a prior cause in ATTAINS for this organization"
    )

  CreateParamUseRef <- CreateParamUseRef %>%
    dplyr::left_join(Flag2, c("ATTAINS.ParameterName", "use_name", "organization_identifier")) %>%
    dplyr::left_join(Flag1, c("use_name", "organization_identifier")) %>%
    dplyr::mutate(IncludeOrExclude = "Include") %>%
    dplyr::mutate(ATTAINS.FlagUseName = dplyr::case_when(
      IncludeOrExclude == "Exclude" ~
        "Use name does not apply for this ATTAINS.ParameterName. Excluding this use name from analysis.",
      is.na(use_name) ~
        "No use name is provided. Consider choosing an appropriate use_name.",
      organization_identifier == "EPA304a" ~
        "Will use the EPA304a recommended standards for this parameter. Do not edit EPA304a use_name",
      organization_identifier != "EPA304a" ~
        "Use name is listed as prior cause in ATTAINS for this org",
      !is.na(ATTAINS.FlagUseName2) ~ ATTAINS.FlagUseName2,
      !is.na(ATTAINS.FlagUseName1) ~ ATTAINS.FlagUseName1,
    )) %>%
    dplyr::arrange(use_name) %>%
    dplyr::select(
      TADA.ComparableDataIdentifier, organization_identifier, EPA304A.PollutantName,
      ATTAINS.ParameterName, use_name, IncludeOrExclude, ATTAINS.FlagUseName
    )

  # remove intermediate objects
  rm(ATTAINS_param, Flag1, Flag2)

  downloads_path <- file.path(Sys.getenv("USERPROFILE"), "Downloads", "myfileRef.xlsx")

  if (excel == TRUE) {
    # Create column names for an empty dataframe
    columns <- c(
      "organization_identifier", "ATTAINS.ParameterName", "use_name",
      "ATTAINS.FlagParameterName", "ATTAINS.FlagUseName"
    )

    # empty dataframe with just column names
    par <- data.frame(matrix(nrow = 0, ncol = length(columns)))
    colnames(par) <- columns

    wb <- openxlsx::loadWorkbook(wb, downloads_path)

    # If a user chooses to rerun the TADA_CreateUseParamRef() function,
    # the sheet will already exist and error.
    tryCatch(
      {
        openxlsx::addWorksheet(wb, "CreateParamUseRef")
      },
      error = function(e) {
        openxlsx::removeWorksheet(wb, "CreateParamUseRef")
        openxlsx::addWorksheet(wb, "CreateParamUseRef")
      }
    )

    # If a user performs TADA_CreateParamRef() with excel = FALSE, but decides to perform TADA_CreateUseParamRef() with excel = TRUE,
    # and if the file already exists, we need to consider how to display the excel tab for [CreateParamRef] to ensure it does not display
    # non-matching dataframe crosswalk to avoid confusion.
    # excel_paramRef <- openxlsx::read.xlsx(downloads_path, sheet = "CreateParamRef")[,c("TADA.CharacteristicName", "TADA.ComparableDataIdentifier", "ATTAINS.ParameterName", "organization_identifier")]
    #
    # if(nrow(dplyr::inner_join(
    #   paramRef[,c("TADA.CharacteristicName", "TADA.ComparableDataIdentifier", "ATTAINS.ParameterName", "organization_identifier")],
    #   excel_paramRef[,c("TADA.CharacteristicName", "TADA.ComparableDataIdentifier", "ATTAINS.ParameterName", "organization_identifier")]))
    #   !=nrow(excel_paramRef)){
    #     if(nrow(excel_paramRef)!=nrow(paramRef)){
    #       warning("Your user-supplied paramRef table does not match the parameter reference table in excel sheet [CreateParamRef]. This may have occured if you have previously ran TADA_CreateParamRef() with excel = TRUE on a previous dataframe and proceeded with running TADA_CreateParamRef() with excel = FALSE and TADACreateParamUseRef() with excel = TRUE")
    #     }
    # }

    # set zoom size
    set_zoom <- function(x) gsub('(?<=zoomScale=")[0-9]+', x, sV, perl = TRUE)
    n_sheets <- length(wb$worksheets)
    for (i in 1:n_sheets) {
      sV <- wb$worksheets[[i]]$sheetViews
      wb$worksheets[[i]]$sheetViews <- set_zoom(90)
    }

    # sV <- wb$worksheets[[2]]$sheetViews
    # wb$worksheets[[2]]$sheetViews <- set_zoom(90)
    # sV <- wb$worksheets[[3]]$sheetViews
    # wb$worksheets[[3]]$sheetViews <- set_zoom(90)
    # sV <- wb$worksheets[[4]]$sheetViews
    # wb$worksheets[[4]]$sheetViews <- set_zoom(90)

    # Format column header
    header_st <- openxlsx::createStyle(textDecoration = "Bold")
    # Format Column widths
    openxlsx::setColWidths(wb, "CreateParamUseRef", cols = 1:ncol(CreateParamUseRef), widths = "auto")

    # Export CreateParamUseRef dataframe into the excel spreadsheet tab
    openxlsx::writeData(wb, "CreateParamUseRef", startCol = 1, x = CreateParamUseRef, headerStyle = header_st)

    # Index of allowable values for drop-down lists
    openxlsx::writeData(wb, "Index", startCol = 2, x = data.frame("IncludeOrExclude" = c("Include", "Exclude")))

    # data validation drop down list created below.
    # Note: ATTAINSOrgNamesParamRef contains the list of prior param and use cause by org names specific.
    # Since Use Names are individual to each Organization.
    suppressWarnings(
      openxlsx::dataValidation(
        wb,
        sheet = "CreateParamUseRef",
        cols = 5, rows = 2:1000,
        type = "list",
        value = sprintf("'ATTAINSOrgNamesParamRef'!$D$2:$D$50000"),
        allowBlank = TRUE, showErrorMsg = TRUE, showInputMsg = TRUE
      )
    )

    suppressWarnings(
      openxlsx::dataValidation(
        wb,
        sheet = "CreateParamUseRef",
        cols = 6, rows = 2:1000,
        type = "list",
        value = sprintf("'Index'!$B$2:$B$5"),
        allowBlank = TRUE, showErrorMsg = TRUE, showInputMsg = TRUE
      )
    )

    for (i in 1:nrow(CreateParamUseRef)) {
      openxlsx::conditionalFormatting(
        wb, "CreateParamUseRef",
        cols = 5, rows = i + 1,
        type = "blanks",
        style = openxlsx::createStyle(bgFill = TADA_ColorPalette()[13])
      )

      openxlsx::conditionalFormatting(
        wb, "CreateParamUseRef",
        cols = 6, rows = i + 1,
        type = "contains",
        rule = c("Exclude"),
        style = openxlsx::createStyle(bgFill = TADA_ColorPalette()[13])
      )

      openxlsx::conditionalFormatting(
        wb, "CreateParamUseRef",
        cols = 6, rows = i + 1,
        type = "contains",
        rule = c("Include"),
        style = openxlsx::createStyle(bgFill = TADA_ColorPalette()[8])
      )

      openxlsx::writeFormula(
        wb, "CreateParamUseRef",
        startCol = 7, startRow = i + 1,
        array = TRUE,
        x = paste0(
          "=IF(B", i + 1, '="EPA304a",
            "Will use the EPA304a recommended standards for this parameter. Do not edit EPA304a use_name",
          IF(F', i + 1, '="Exclude",
            "Use name does not apply for this ATTAINS.ParameterName. Exclusing this use name from analysis.",
          IF(ISBLANK(E', i + 1, '),
            "No use name is provided. Consider choosing an appropriate use_name.",
          IF(ISNA(MATCH(1,(E', i + 1, "=Index!G:G)*(B", i + 1, '=Index!D:D),0)),
            "Use name is not listed as a prior cause in ATTAINS for this organization",
          IF(ISNA(MATCH(1,(D', i + 1, "=Index!H:H)*(E", i + 1, "=Index!G:G)*(B", i + 1, '=Index!D:D),0)),
            "Use name is listed as a prior cause in this organization, but not for this parameter name",
            "Use name is listed as prior cause in ATTAINS for this org")))))'
        )
      )
    }

    # Handles overwriting the excel file.
    if (overwrite == TRUE) {
      openxlsx::saveWorkbook(wb, downloads_path, overwrite = T)
    }

    if (overwrite == FALSE) {
      warning(
        "If you would like to replace [CreateParamUseRef], use overwrite = TRUE argument in TADA_CreateUseParamRef"
      )
      openxlsx::saveWorkbook(wb, downloads_path, overwrite = F)
    }

    cat("File saved to:", gsub("/", "\\\\", downloads_path), "\n")
  }
  return(CreateParamUseRef)
}
USEPA/TADA documentation built on April 12, 2025, 1:47 p.m.