TADA: EPA TADA (Tools for Automated Data Analysis) R Package

Documented in TADA_CalculateTotalNP TADA_HarmonizeSynonyms

#' Harmonize Synonyms
#'
#' This function joins a synonym reference table to the dataset to convert
#' synonymous data to a unified naming format for easier aggregation, analysis,
#' and visualization. Users may populate the function with a dataset-specific
#' synonym table created from TADA_GetSynonymRef and reviewed/customized by the
#' user (recommended), or the default TADA-provided synonym table, containing
#' suggested synonym naming for some priority characteristics. Where a suggested
#' characteristic name, fraction, speciation, or unit is present, the function
#' will convert the TADA.CharacteristicName, TADA.ResultSampleFractionText,
#' and TADA.MethodSpeciationName to the target format. In cases where a target
#' speciation differs from the existing speciation, the reference table will
#' also apply multiplication conversion factors to the TADA.ResultMeasureValue.
#'
#' @param .data TADA dataframe
#' @param ref Optional argument to specify which dataframe to use as a reference
#' file. The primary use for this argument is when a user has generated a
#' synonym reference file unique to their data, and they made changes to
#' that file.
#' @param np_speciation Boolean. Determines whether the user wants to convert
#'   nitrogen and phosphorus subspecies to speciation 'as N' and 'as P', where
#'   speciation conversions are provided. Defaults to TRUE. For example, if
#'   np_speciation is TRUE, all Nitrate with TADA.MethodSpeciationName = as
#'   NO3 will be converted to as N using molecular weight conversion factors.
#'
#' @return The input TADA dataframe with the TADA.CharacteristicName,
#'   TADA.ResultSampleFractionText, TADA.MethodSpeciationName, and
#'   TADA.ResultMeasure.MeasureUnitCode columns converted to the target values,
#'   if supplied. Also includes additional columns
#'   TADA.CharacteristicNameAssumptions, TADA.FractionAssumptions, and
#'   TADA.SpeciationAssumptions populated with additional notes about the conversion
#'   logic, and a TADA.Harmonized.Flag, indicating whether TADA columns were
#'   changed in this function.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Load example dataset:
#' data(Data_6Tribes_5y)
#'
#' # Create a synonym reference table for flagged, cleaned dataframe:
#' Data_6Tribes_5yClean <- subset(
#'   Data_6Tribes_5y,
#'   !is.na(Data_6Tribes_5y$TADA.ResultMeasureValue)
#' )
#' Data_6Tribes_5yClean <- TADA_FlagFraction(Data_6Tribes_5yClean,
#'   clean = TRUE
#' )
#' Data_6Tribes_5yClean <- TADA_FlagResultUnit(Data_6Tribes_5yClean,
#'   clean = "suspect_only"
#' )
#' Data_6Tribes_5yClean <- TADA_FlagSpeciation(Data_6Tribes_5yClean,
#'   clean = "suspect_only"
#' )
#' Data_6Tribes_5yClean <- TADA_FlagMethod(Data_6Tribes_5yClean, clean = TRUE)
#' CreateRefTable <- TADA_GetSynonymRef(Data_6Tribes_5yClean)
#'
#' # Append synonym reference table columns to dataframe and transform/convert
#' # data to the USER SUPPLIED reference table values:
#' Data_6Tribes_5yClean_Harmonized <-
#'   TADA_HarmonizeSynonyms(Data_6Tribes_5yClean, ref = CreateRefTable)
#' }
#'
TADA_HarmonizeSynonyms <- function(.data, ref, np_speciation = TRUE) {
  # check .data is data.frame
  TADA_CheckType(.data, "data.frame", "Input object")

  # check .data has the required columns
  expected_cols <- c(
    "TADA.CharacteristicName",
    "TADA.ResultSampleFractionText",
    "TADA.MethodSpeciationName"
  )
  TADA_CheckColumns(.data, expected_cols)

  # define which columns are expected in ref
  expected_ref_cols <- c(
    "TADA.CharacteristicName",
    "Target.TADA.CharacteristicName",
    "TADA.CharacteristicNameAssumptions",
    "TADA.ResultSampleFractionText",
    "Target.TADA.ResultSampleFractionText",
    "TADA.FractionAssumptions",
    "TADA.MethodSpeciationName",
    "Target.TADA.MethodSpeciationName",
    "TADA.SpeciationAssumptions",
    "Target.TADA.SpeciationConversionFactor",
    "HarmonizationGroup"
  )

  # if class(ResultMeasureValue) != numeric, run special char function - EDH - should not be needed at this point but doesn't hurt.
  if (!is.numeric(.data$TADA.ResultMeasureValue)) {
    stop("TADA.ResultMeasureValue is not numeric. This column must be numeric before proceeding.")
  }

  # Change NONE in unit, fraction, and speciation to NA for better harmonization
  .data <- .data %>% dplyr::mutate(
    TADA.ResultSampleFractionText = replace(TADA.ResultSampleFractionText, TADA.ResultSampleFractionText %in% c("NONE"), NA),
    TADA.MethodSpeciationName = replace(TADA.MethodSpeciationName, TADA.MethodSpeciationName %in% c("NONE"), NA),
    TADA.ResultMeasure.MeasureUnitCode = replace(TADA.ResultMeasure.MeasureUnitCode, TADA.ResultMeasure.MeasureUnitCode %in% c("NONE"), NA)
  )

  # define harm.ref
  # if input for ref exists, use that data
  if (!missing(ref)) {
    # check ref is data.frame
    TADA_CheckType(ref, "data.frame")

    # check ref has all of the required columns
    TADA_CheckColumns(ref, expected_ref_cols)

    harm.ref <- ref
  }

  # if input for ref does not exist, use raw harmonization template
  if (missing(ref)) {
    # use output of TADA_GetSynonymRef which uses the TADA HarmonizationTemplate.csv in the extdata folder
    harm.ref <- TADA_GetSynonymRef(.data) %>%
      dplyr::distinct()
  }

  # find places where metadata will be changed and add targets
  harm.ref$TADA.Harmonized.Flag <- ifelse(!is.na(harm.ref$Target.TADA.CharacteristicName) |
    !is.na(harm.ref$Target.TADA.ResultSampleFractionText) |
    !is.na(harm.ref$Target.TADA.MethodSpeciationName),
  TRUE, FALSE
  )

  .data <- .data[, !names(.data) %in% c("TADA.ComparableDataIdentifier")]

  # join harm.ref to .data
  flag.data <- .data %>%
    dplyr::left_join(harm.ref, by = c(
      "TADA.CharacteristicName",
      "TADA.ResultSampleFractionText",
      "TADA.MethodSpeciationName"
    ))

  # TADA.CharacteristicName
  # replace TADA.CharacteristicName with Target.TADA.CharacteristicName
  clean.data <- flag.data %>%
    # use TADA suggested name where there is a suggested name, use original name if no suggested name
    dplyr::mutate(TADA.CharacteristicName = dplyr::case_when(
      !is.na(Target.TADA.CharacteristicName) ~ Target.TADA.CharacteristicName,
      # is.na(Target.TADA.CharacteristicName) ~ TADA.CharacteristicName,
      .default = TADA.CharacteristicName
    ))

  # TADA.ResultSampleFractionText
  # replace ResultSampleFractionText with Target.TADA.ResultSampleFractionText
  clean.data <- clean.data %>%
    # use TADA suggested frac where there is a suggested frac, use original frac if no suggested frac
    dplyr::mutate(TADA.ResultSampleFractionText = dplyr::case_when(
      !is.na(Target.TADA.ResultSampleFractionText) ~ Target.TADA.ResultSampleFractionText,
      !is.na(TADA.ResultSampleFractionText) & is.na(Target.TADA.ResultSampleFractionText) & !is.na(TADA.FractionAssumptions) ~ Target.TADA.ResultSampleFractionText,
      # is.na(Target.TADA.ResultSampleFractionText) ~ TADA.ResultSampleFractionText
      .default = TADA.ResultSampleFractionText
    ))

  # Handle instances with DO where the speciation is listed "AS O2" but it should be NA
  clean.data$TADA.MethodSpeciationName <- ifelse(!is.na(clean.data$TADA.MethodSpeciationName) & is.na(clean.data$Target.TADA.MethodSpeciationName) & !is.na(clean.data$TADA.SpeciationAssumptions), clean.data$Target.TADA.MethodSpeciationName, clean.data$TADA.MethodSpeciationName)

  # TADA.MethodSpeciationName
  # replace MethodSpeciationName with Target.TADA.MethodSpeciationName

  if (np_speciation == TRUE) {
    clean.data <- clean.data %>%
      # use TADA suggested spec where there is a suggested spec, use original spec if no suggested spec
      dplyr::mutate(TADA.MethodSpeciationName = dplyr::case_when(
        !is.na(Target.TADA.MethodSpeciationName) ~ Target.TADA.MethodSpeciationName,
        # is.na(Target.TADA.MethodSpeciationName) ~ TADA.MethodSpeciationName
        .default = TADA.MethodSpeciationName
      )) %>%
      # if conversion factor exists, multiply by ResultMeasureValue
      dplyr::rowwise() %>%
      dplyr::mutate(TADA.ResultMeasureValue = dplyr::case_when(
        !is.na(Target.TADA.SpeciationConversionFactor) ~
          (Target.TADA.SpeciationConversionFactor * TADA.ResultMeasureValue),
        # is.na(Target.TADA.SpeciationConversionFactor) ~ TADA.ResultMeasureValue
        .default = TADA.ResultMeasureValue
      ))
  } else {
    clean.data <- clean.data %>%
      # use TADA suggested spec where there is a suggested spec, use original spec if no suggested spec
      dplyr::mutate(TADA.MethodSpeciationName = dplyr::case_when(
        !is.na(Target.TADA.MethodSpeciationName) & is.na(Target.TADA.SpeciationConversionFactor) ~ Target.TADA.MethodSpeciationName,
        # is.na(Target.TADA.MethodSpeciationName) ~ TADA.MethodSpeciationName
        .default = TADA.MethodSpeciationName
      ))
  }

  # remove conversion columns
  clean.data <- clean.data %>%
    dplyr::select(-c(
      "Target.TADA.CharacteristicName",
      "Target.TADA.ResultSampleFractionText",
      "Target.TADA.MethodSpeciationName",
      "Target.TADA.SpeciationConversionFactor",
      "HarmonizationGroup"
    ))

  clean.data$TADA.Harmonized.Flag <- ifelse(is.na(clean.data$TADA.Harmonized.Flag), FALSE, clean.data$TADA.Harmonized.Flag)

  # return clean.data
  clean.data <- TADA_CreateComparableID(clean.data)
  clean.data <- TADA_OrderCols(clean.data)
  return(clean.data)
}


#' Calculate Total Nitrogen and Phosphorus
#'
#' This function applies the [Nutrient Aggregation logic](https://echo.epa.gov/trends/loading-tool/resources/nutrient-aggregation#nitrogen)
#' from ECHO's Water Pollutant Loading Tool to add nitrogen subspecies together
#' to approximate a total nitrogen value on a single day at a single site.
#' Before summing subspecies, this function runs TADA_AggregateMeasurements to
#' obtain the max value of a characteristic-fraction-speciation at a given site,
#' date, and depth. Where necessary, it uses conversion factors to convert
#' nitrogen subspecies expressed as nitrate, nitrite, ammonia, ammonium, etc. to
#' as nitrogen based on the atomic weights of the different elements in the
#' compound. The reference table is contained within the package but may be
#' edited/customized by users. Nutrient equations are as follows:
#'
#' NITROGEN:
#' 1. TOTAL N (UNFILTERED)
#' 2. TOTAL N (FILTERED) + TOTAL N (PARTICULATE)
#' 3. TOTAL KJELDAHL NITROGEN + NITRATE + NITRITE
#' 4. ORGANIC N + AMMONIA + NITRATE + NITRITE
#' 5. OTHER NITROGEN FORMS
#'
#' PHOSPHORUS:
#' 1. TOTAL PHOSPHORUS
#' 2. PHOSPHATE
#' 3. OTHER PHOSPHORUS FORMS
#'
#' Equations are applied in the order above. The function looks for groups of
#' nutrients that exactly match each equation before looking for every
#' combination within each equation (for example, a group of nitrogen subspecies
#' including AMMONIA and NITRATE will be passed over in an initial sweep of
#' groups of subspecies containing ORG N, AMMONIA, NITRATE, and NITRITE, but
#' will be caught as the function moves down the hierarchy of equations to fewer
#' and fewer subspecies). Eventually, even groups with only one subspecies will
#' be used to represent a TOTAL N value for that site/day/depth.
#'
#' @param .data TADA dataframe, ideally harmonized using TADA_HarmonizeSynonyms.
#'   If user wants to consider grouping N or P subspecies across multiple
#'   organizations, user should have run TADA_FindNearbySites and grouped all
#'   nearby sites to one common TADA.MonitoringLocationIdentifier,
#'   TADA.LatitudeMeasure, TADA.LongitudeMeasure, etc.
#' @param sum_ref Optional. A custom summation reference dataframe the user has
#'   loaded into the R environment. Dataframe must have same columns as default
#'   TADA.summation reference table.
#' @param daily_agg The function used to aggregate to a single
#'   characteristic-unit-fraction-speciation at the same location and depth on
#'   the same day for multiple measurements. Defaults to 'max', but can be set
#'   to 'min' or 'mean'.
#'
#' @return Input TADA dataframe with additional rows representing total N and P
#'   summation values from adding up subspecies. Note that for total phosphorus,
#'   these additional rows are simply a re-classification of phosphorus or
#'   phosphate into the total phosphorus as P format. These new rows share the
#'   same date and monitoring location as the subspecies, but an additional note
#'   is added in the TADA.NutrientSummation.Flag column describing how the total
#'   was derived. Also adds TADA.NutrientSummationGroup and
#'   TADA.NutrientSummationEquation columns, which can be used to trace how the
#'   total was calculated and from which subspecies.
#'
#' @export

TADA_CalculateTotalNP <- function(.data, sum_ref, daily_agg = c("max", "min", "mean")) {
  # check to make sure daily_agg is populated with allowable value
  daily_agg <- match.arg(daily_agg)

  # check required columns for TADA dataset
  req_cols <- c(
    "TADA.CharacteristicName",
    "TADA.ResultSampleFractionText",
    "TADA.MethodSpeciationName",
    "TADA.ResultMeasure.MeasureUnitCode",
    "TADA.ResultMeasureValue",
    "ActivityStartDate",
    "TADA.MonitoringLocationIdentifier",
    "ActivityTypeCode"
  )
  TADA_CheckColumns(.data, expected_cols = req_cols)

  # bring in custom reference df if provided
  if (!missing(sum_ref)) {
    ref_cols <- names(TADA_GetNutrientSummationRef())
    TADA_CheckColumns(sum_ref, expected_cols = ref_cols)
  } else {
    sum_ref <- TADA_GetNutrientSummationRef()
  }

  # check if QC flag function run and print warning if not
  if (!"TADA.ActivityType.Flag" %in% names(.data)) {
    "Warning: TADA dataset does not have the TADA.ActivityType.Flag column, which indicates QC replicates have not been handled/reviewed. This function is not built to handle QC replicate samples and will use them to aggregate to a daily max and total nutrient value."
  }

  # Get grouping cols for daily aggregation
  # create nutrient groups by site, date, and depth
  depths <- names(.data)[grepl("DepthHeightMeasure", names(.data))]
  depths <- depths[grepl("TADA.", depths)]
  grpcols <- c(
    "ActivityStartDate",
    # "ActivityStartDateTime", #does not make sense to include for daily agg
    "ActivityRelativeDepthName",
    "TADA.MonitoringLocationIdentifier",
    "MonitoringLocationName",
    "TADA.LongitudeMeasure",
    "TADA.LatitudeMeasure",
    "ActivityMediaSubdivisionName",
    "TADA.ActivityMediaName",
    "TADA.ComparableDataIdentifier",
    "TADA.ResultMeasure.MeasureUnitCode",
    depths
  )

  dat <- suppressMessages(TADA_AggregateMeasurements(.data, grouping_cols = grpcols, agg_fun = daily_agg, clean = TRUE))

  # join data to summation table and keep only those that match for summations
  sum_dat <- merge(dat, sum_ref, all.x = TRUE)
  sum_dat <- subset(sum_dat, !is.na(sum_dat$NutrientGroup))

  ## REMINDER FOR TADA TEAM: NEED TO ENSURE ALL COMBOS PRESENT IN TABLE

  # If the join results in matching rows
  if (dim(sum_dat)[1] > 0) {
    thecols <- grpcols[!grpcols %in% c("TADA.ComparableDataIdentifier")]

    # # find nearby sites
    # nearsites = unique(sum_dat[,c("TADA.MonitoringLocationIdentifier","TADA.LatitudeMeasure","TADA.LongitudeMeasure")])
    # nearsites = TADA_FindNearbySites(nearsites)
    # nearsites = subset(nearsites, !nearsites$TADA.NearbySiteGroups%in%c("No nearby sites"))

    # create nutrient group ID's.
    sum_dat <- sum_dat %>%
      dplyr::group_by(dplyr::across(dplyr::all_of(thecols))) %>%
      dplyr::mutate(TADA.NutrientSummationGroup = dplyr::cur_group_id())

    # bring in equations
    eqns <- utils::read.csv(system.file("extdata", "NP_equations.csv", package = "EPATADA"))


    # dataframe to hold results
    summeddata <- data.frame()
    grps <- vector()

    for (i in 1:length(unique(eqns$Nutrient))) {
      nut <- unique(eqns$Nutrient)[i]
      nutqns <- subset(eqns, eqns$Nutrient == nut)
      for (j in 1:length(unique(nutqns$EQN))) {
        eqnum <- unique(nutqns$EQN)[j]
        eqn <- subset(nutqns, nutqns$EQN == eqnum)$SummationName
        nutrient <- ifelse(nut == "N", "Total Nitrogen as N", "Total Phosphorus as P")
        # for each equation, see if any groups contain all required subspecies, and for each pick the variant with the lowest rank.
        # combine group with other groups and remove group ID from consideration for the next equation
        out <- sum_dat %>%
          dplyr::filter(!TADA.NutrientSummationGroup %in% grps) %>%
          dplyr::group_by(TADA.NutrientSummationGroup) %>%
          dplyr::filter(all(eqn %in% SummationName)) %>% # this line ensures that ALL subspecies are present within an equation group, not just one or more
          dplyr::filter(SummationName %in% eqn) %>%
          dplyr::mutate(TADA.NutrientSummationEquation = paste0(unique(SummationName), collapse = " + "))

        out <- out %>%
          dplyr::group_by(TADA.NutrientSummationGroup, SummationName) %>%
          dplyr::slice_min(SummationRank, with_ties = FALSE)
        out$TADA.NutrientSummation.Flag <- paste0("Used to calculate ", nutrient, ".")
        out$nutrient <- nutrient
        summeddata <- plyr::rbind.fill(summeddata, out)
        grps <- c(grps, unique(out$TADA.NutrientSummationGroup))
      }
    }

    # check to make sure group isn't entirely non-detects


    # Convert speciation if needed
    summeddata$TADA.ResultMeasureValue <- ifelse(!is.na(summeddata$SummationSpeciationConversionFactor), summeddata$TADA.ResultMeasureValue * summeddata$SummationSpeciationConversionFactor, summeddata$TADA.ResultMeasureValue)
    summeddata$TADA.MethodSpeciationName <- ifelse(!is.na(summeddata$SummationSpeciationConversionFactor) & summeddata$nutrient == "Total Nitrogen as N", "AS N", summeddata$TADA.MethodSpeciationName)
    summeddata$TADA.MethodSpeciationName <- ifelse(!is.na(summeddata$SummationSpeciationConversionFactor) & summeddata$nutrient == "Total Phosphorus as P", "AS P", summeddata$TADA.MethodSpeciationName)

    # Get to total N or P

    totncols <- c(thecols, "TADA.NutrientSummationGroup", "TADA.NutrientSummationEquation")
    TotalN <- summeddata %>%
      dplyr::filter(nutrient == "Total Nitrogen as N") %>%
      dplyr::group_by(dplyr::across(dplyr::all_of(totncols))) %>%
      dplyr::summarise(TADA.ResultMeasureValue = sum(TADA.ResultMeasureValue)) %>%
      dplyr::mutate(TADA.CharacteristicName = "TOTAL NITROGEN, MIXED FORMS", TADA.ResultSampleFractionText = "UNFILTERED", TADA.MethodSpeciationName = "AS N", TADA.NutrientSummation.Flag = "Nutrient summation from one or more subspecies.")
    TotalP <- summeddata %>%
      dplyr::filter(nutrient == "Total Phosphorus as P") %>%
      dplyr::group_by(dplyr::across(dplyr::all_of(totncols))) %>%
      dplyr::summarise(TADA.ResultMeasureValue = sum(TADA.ResultMeasureValue)) %>%
      dplyr::mutate(TADA.CharacteristicName = "TOTAL PHOSPHORUS, MIXED FORMS", TADA.ResultSampleFractionText = "UNFILTERED", TADA.MethodSpeciationName = "AS P", TADA.NutrientSummation.Flag = "Nutrient summation from one subspecies.")

    # if summation is zero....include anyway?

    Totals <- plyr::rbind.fill(TotalN, TotalP)
    Totals$ResultIdentifier <- paste0("TADA-", sample(100000000:1000000000, dim(Totals)[1])) # give TADA ResultIdentifier

    # combine all data back into input dataset and get rid of unneeded columns
    .data <- merge(.data, summeddata, all.x = TRUE)
    .data <- plyr::rbind.fill(.data, Totals)
    .data <- .data %>% dplyr::select(-c(SummationFractionNotes, SummationSpeciationNotes, SummationSpeciationConversionFactor, SummationName, SummationRank, SummationNote, nutrient, NutrientGroup))
    .data$TADA.NutrientSummation.Flag[is.na(.data$TADA.NutrientSummation.Flag)] <- "Not used to calculate Total N or P."
  } else {
    # if there are no data to sum
    .data$TADA.NutrientSummation.Flag <- "Not used to calculate Total N or P."
    print("No Total N or P subspecies exist in dataset. Returning input dataset with TADA.NutrientSummation.Flag set to 'Not used to calculate Total N or P'")
  }

  # order columns
  .data <- TADA_CreateComparableID(.data)
  .data <- TADA_OrderCols(.data)
}