R/Utilities.R

Defines functions TADA_TableExport TADA_CharStringRemoveNA TADA_ViewColorPalette TADA_ColorPalette TADA_UniqueCharUnitSpeciation TADA_addPoints TADA_addPolys getPopup getLayer writeLayer getFeatureLayer pchIcons getBboxJson TADA_AggregateMeasurements TADA_RandomTestingData TADA_GetUniqueNearbySites TADA_FindNearbySites TADA_FormatDelimitedString TADA_CreateComparableID TADA_SubstituteDeprecatedChars TADA_ConvertSpecialChars TADA_CheckColumns TADA_CheckType TADA_InsertBreaks TADA_DecimalPlaces quiet

Documented in getBboxJson getFeatureLayer getLayer getPopup pchIcons quiet TADA_addPoints TADA_addPolys TADA_AggregateMeasurements TADA_CharStringRemoveNA TADA_CheckColumns TADA_CheckType TADA_ColorPalette TADA_ConvertSpecialChars TADA_CreateComparableID TADA_DecimalPlaces TADA_FindNearbySites TADA_FormatDelimitedString TADA_GetUniqueNearbySites TADA_InsertBreaks TADA_RandomTestingData TADA_SubstituteDeprecatedChars TADA_TableExport TADA_UniqueCharUnitSpeciation TADA_ViewColorPalette writeLayer

#' Pipe operator
#'
#' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
#'
#' @name %>%
#' @rdname pipe
#' @keywords internal
#' @importFrom magrittr %>%
#' @usage lhs \%>\% rhs
#' @export
#' @param lhs A value or the magrittr placeholder.
#' @param rhs A function call using the magrittr semantics.
#' @return The result of calling `rhs(lhs)`.
NULL

#' Silence print messages
#' @name quiet
#' @param x Code to silence
#' @return Function or code output with print messages silenced
quiet <- function(x) {
  sink(tempfile())
  on.exit(sink())
  invisible(force(x))
}

# write global variables. Gets rid of global variable NOTE in check:
utils::globalVariables(c(
  "TADA.ResultValueAboveUpperThreshold.Flag", "ActivityIdentifier", "ActivityMediaName",
  "ActivityStartDate", "TADA.ResultValueBelowUpperThreshold.Flag",
  "TADA.ResultValueBelowLowerThreshold.Flag", "CharacteristicName",
  "Conversion.Factor", "Count", "Description", "FieldName", "FieldValue",
  "MethodSpecationName", "MonitoringLocationIdentifier",
  "OrganizationFormalName", "OrganizationIdentifier", "ProjectDescriptionText",
  "ProjectFileUrl", "ProjectIdentifier",
  "ProjectMonitoringLocationWeightingUrl", "ProjectName",
  "QAPPApprovalAgencyName", "QAPPApprovedIndicator",
  "ResultDetectionConditionText", "ResultMeasureValue",
  "SamplingDesignTypeCode", "Source", "Status", "TADA.ContinuousData.Flag",
  "TADA.SuspectCoordinates.Flag", "TADA.PotentialDupRowIDs.Flag", "TADA.QAPPDocAvailable",
  "Target.Unit", "Type", "Value.Unit", "TADA.AnalyticalMethod.Flag",
  "TADA.MethodSpeciation.Flag", "TADA.ResultUnit.Flag",
  "TADA.SampleFraction.Flag", "YearSummarized", "where", "TADA.CharacteristicName",
  "ResultIdentifier", "TADA.ResultMeasureValue", "n_sites",
  "n_records", "statecodes_df", "STUSAB", "ActivityStartTime.Time", "numorgs", "dup_id",
  "LatitudeMeasure", "TADA.ResultMeasureValueDataTypes.Flag", "Name", "TADA.Detection_Type",
  "DetectionQuantitationLimitTypeName", "TADA.Limit_Type", "multiplier", "summ", "cf",
  "LongitudeMeasure", "TADA.CensoredData.Flag", "Censored_Count",
  "Status2", "ActivityTypeCode", "SampleCollectionEquipmentName",
  "ResultTimeBasisText", "StatisticalBaseCode", "ResultValueTypeName",
  "masked", "TADA.env", "Legend", "Fields", "desc", "WQXActivityType_Cached",
  "TADA.ActivityType.Flag", "Code", "ActivityTypeCode", "ResultCount",
  "tot_n", "MonitoringLocationName", "TADA.LatitudeMeasure",
  "TADA.LongitudeMeasure", "median", "sd", "TADA.ComparableDataIdentifier",
  "desc", "Legend", "roundRV", "TADA.DuplicateID", "maxRV", "within10",
  "AllGroups", "Domain.Value.Status", "Char_Flag", "Comparable.Name",
  "TADA.ResultMeasureValue1", "TADA.ResultSampleFractionText",
  "TADA.MethodSpeciationName", "TADA.ResultMeasure.MeasureUnitCode",
  "TADA.ActivityMediaName", "TADA.NutrientSummationGroup",
  "SummationName", "SummationRank", "SummationFractionNotes", "SummationSpeciationNotes",
  "SummationSpeciationConversionFactor", "SummationNote", "NutrientGroup",
  "Target.Speciation", "TADA.NearbySiteGroups", "numres", "TADA.SingleOrgDupGroupID",
  "TADA.MeasureQualifierCode.Flag", "TADA.MeasureQualifierCode.Def", "MeasureQualifierCode",
  "value", "Flag_Column", "Data_NCTCShepherdstown_HUC12", "ActivityStartDateTime",
  "TADA.MultipleOrgDupGroupID", "TADA.WQXVal.Flag", "Concat", ".", "MeasureQualifierCode.Split",
  "TADA.Media.Flag", "ML.Media.Flag", "TADA.UseForAnalysis.Flag",
  "Unique.Identifier", "Domain", "Note.Recommendation", "Conversion.Coefficient",
  "Conversion.Coefficient", "Last.Change.Date", "Value", "Minimum", "Unique.Identifier",
  "Domain", "ResultMeasure.MeasureUnitCode", "Comb", "CombList",
  "TADA.Target.ResultMeasure.MeasureUnitCode", "TADA.WQXUnitConversionFactor",
  "TADA.WQXUnitConversionCoefficient", "TADA.Target.MethodSpeciationName",
  "flag", "NConvert", "MultUnits", "CharList", "CharUnit", "SingleNearbyGroup",
  "TADA.MultipleOrgDuplicate", "TADA.ResultSelectedMultipleOrgs", "Maximum",
  "OBJECTID", "GLOBALID", "assessmentunitidentifier", "index", "epsg",
  "ResultMeasure.MeasureUnitCode", "TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode",
  "DetectionQuantitationLimitMeasure.MeasureUnitCode", "NCode",
  "ATTAINS.assessmentunitidentifier", "ATTAINS_AU", "TOTALAREA_MI", "TOTALAREA_KM",
  "ATTAINS_AUs", "ARD_Category", "ActivityRelativeDepthName", "DepthsByGroup",
  "DepthsPerGroup", "MeanResults", "MonitoringLocationTypeName", "N", "SecchiConversion",
  "TADA.ActivityBottomDepthHeightMeasure.MeasureValue",
  "TADA.ActivityDepthHeightMeasure.MeasureUnitCode", "TADA.ActivityDepthHeightMeasure.MeasureValue",
  "TADA.CharacteristicsForDepthProfile TADA.ConsolidatedDepth",
  "TADA.ConsolidatedDepth.Bottom TADA.ConsolidatedDepth.Unit", "TADA.DepthCategory.Flag",
  "TADA.DepthProfileAggregation.Flag", "TADA.NResults",
  "TADA.ResultDepthHeightMeasure.MeasureUnitCode", "TADA.ResultDepthHeightMeasure.MeasureValue",
  "YAxis.DepthUnit", "TADA.CharacteristicsForDepthProfile", "TADA.ConsolidatedDepth",
  "TADA.ConsolidatedDepth.Bottom", "TADA.ConsolidatedDepth.Unit", "col2rgb",
  "palette.colors", "rect", "rgb", "text", "CodeNoSpeciation", "ResultMeasure.MeasureUnitCode.Upper",
  "TADA.MonitoringLocationIdentifier", "StringA", "StringB", "MeasureUnitCode.match",
  "TADA.ActivityTopDepthHeightMeasure.MeasureValue", "group_id", "time_diff_lead", "time_diff_lag",
  "NResults", "missing.group", "TADA.PairingGroup", "TADA.PairingGroup.Rank", "timediff",
  "TADA.MonitoringLocationName", "TADA.MonitoringLocationTypeName",
  "ATTAINS.submissionid", "HorizontalCoordinateReferenceSystemDatumName",
  "NCount", "NHD.catchmentareasqkm", "NHD.comid", "NHD.nhdplusid", "NHD.resolution",
  "areasqkm", "assessmentUnitIdentifier", "catchmentareasqkm", "comid",
  "featureid", "geometry", "nhdplusid", "waterTypeCode", "TADA.NearbySiteGroup",
  "TADA.MonitoringLocationIdentifier.New", "TADA.NearbySites.Flag", "CountSites", "Group",
  "Matrix", "n_id", "OrgRank", "rank.default", "Site", "TADA.LatitudeMeasure.New",
  "TADA.LongitudeMeasure.New", "TADA.MonitoringLocationName.New",
  "TADA.MonitoringLocationTypeName.New", "df_number", "ASSESSMENT_UNIT_ID",
  "ATTAINS.FlagParameterName", "ATTAINS.FlagUseName", "ATTAINS.ParameterName",
  "CRITERIATYPEAQUAHUMHLTH", "CRITERIATYPEFRESHSALTWATER", "CRITERIATYPE_ACUTECHRONIC",
  "CRITERIATYPE_WATERORG", "CRITERION_VALUE", "ENTITY_ABBR", "EPA304A.PollutantName",
  "IncludeOrExclude", " MONITORING_DATA_LINK_TEXT", "MONITORING_DATA_LINK_TEXT.New",
  "MS_LOCATION_ID", "MS_ORG_ID", "MonitoringDataLinkText", "OrgIDForURL", "POLLUTANT_NAME",
  "ProviderName", "TADA.SingleOrgDup.Flag", "UNIT_NAME", "URLencode", "USE_CLASS_NAME_LOCATION_ETC",
  "assessment_unit_identifier", "monitoring_data_link_text", "monitoring_location_identifier",
  "monitoring_organization_identifier", "monitoring_stations", "organization_identifier",
  "organization_identifier.y", "parameter", "use_name", "use_name.y",
  "MONITORING_DATA_LINK_TEXT", "PARCEL_NO", "TRIBE_NAME", "everything",
  "resultCount", "tribal_area", "txtProgressBar"
))

# global variables for tribal feature layers used in TADA_OverviewMap in Utilities.R
AKAllotmentsUrl <- "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/0/query"
AKVillagesUrl <- "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/1/query"
AmericanIndianUrl <- "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/2/query"
OffReservationUrl <- "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/3/query"
OKTribeUrl <- "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4/query"
VATribeUrl <- "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/5/query"

#' Decimal Places
#'
#' for numeric data type
#'
#' @param x Numeric data field from TADA profile
#'
#' @return Number of values to the right of the decimal point for numeric
#' type data.
TADA_DecimalPlaces <- function(x) {
  if (abs(x - round(x)) > .Machine$double.eps^0.5) {
    nchar(strsplit(sub("0+$", "", as.character(x)), ".", fixed = TRUE)[[1]][[2]])
  } else {
    return(0)
  }
}

#' Insert Breaks
#'
#' This function inserts a new line into a string when it exceeds a
#' user-specified length. New lines are added to spaces in the string. This is
#' intended to make plot legends more readable and tidy.
#'
#' @param x A vector of strings
#' @param len The maximum character length a string can be before the function
#'   searches for the best space to insert a new line.
#'
#' @return The same vector of strings with new lines added where appropriate.
#'
#' @export
TADA_InsertBreaks <- function(x, len = 50) {
  if (nchar(x) > len) {
    multiples <- floor(nchar(x) / len)
    lens <- seq(len, len * multiples, by = len)
    spaces <- unlist(gregexpr(" ", x))
    if (max(spaces) > len) {
      spots <- sapply(lens, function(x) spaces[min(which(spaces > x))])
      for (i in 1:length(spots)) {
        stringi::stri_sub(x, spots[i] + (i - 1), spots[i]) <- "\n "
      }
    }
  }
  return(x)
}

#' Check Type
#'
#' This function checks if the inputs to a function are of the expected type. It
#' is used at the beginning of TADA functions to ensure the
#' inputs are suitable.
#'
#' @param arg An input argument to check
#' @param type Expected class of input argument
#' @param paramName Optional name for argument to use in error message
TADA_CheckType <- function(arg, type, paramName) {
  if ((type %in% class(arg)) == FALSE) {
    # if optional parameter name not specified use arg in errorMessage
    if (missing(paramName)) {
      paramName <- arg
    }
    errorMessage <- paste(paramName, " must be of class '", type, "'", sep = "")
    return(stop(errorMessage))
  }
}

#' Check Columns
#'
#' This function checks if the expected column names are in the dataframe. It is
#' used at the beginning of TADA functions to ensure the input data frame is
#' suitable (i.e. is either the full physical/chemical results profile
#' downloaded from WQP or the TADA profile template downloaded from the EPA TADA
#' webpage.)
#'
#' @param .data A dataframe
#' @param expected_cols A vector of expected column names as strings
TADA_CheckColumns <- function(.data, expected_cols) {
  if (all(expected_cols %in% colnames(.data)) == FALSE) {
    stop("The dataframe does not contain the required fields. Use either the full physical/chemical profile downloaded from WQP or download the TADA profile template available on the EPA TADA webpage.")
  }
}

#' TADA_ConvertSpecialChars
#'
#' This function will screen a column of the user's choice for special
#' characters. It creates a NEW column that describes the content of the column
#' prior to conversion to numeric (named "TADA.COLUMN NAME DataTypes.Flag"). It
#' also creates a NEW column to hold the new, numeric format (named "TADA.COLUMN
#' NAME"). This function will successfully convert some special character
#' formats to numeric: whitespace, >, <, ~, %, and commas are removed before
#' converting a result value to numeric. Result values in the format # - # are
#' converted to an average of the two numbers. Result values
#' containing any other text or non-numeric characters become NA in
#' the newly created "TADA.COLUMN NAME" and labeled accordingly in "TADA.COLUMN
#' NAME DataTypes.Flag".
#'
#'
#' @param .data A TADA profile object
#' @param col A character column to be converted to numeric
#'
#' @param percent.ave Boolean argument; default is percent.ave = TRUE. When
#' clean = TRUE, any percent range values will be averaged. When
#' percent.ave = FALSE, percent range values are not averaged, but are flagged.
#'
#' @return Returns the original dataframe with two new columns: the input column
#' with the prefix "TADA.", which holds the numeric form of the original column,
#' and "TADA.COLUMN NAME DataTypes.Flag", which has text describing the type of
#' data contained within the column of interest, including "Numeric",
#' "Less Than" (<), "Greater Than" (>), "Approximate Value" (~), "Text" (A-z),
#' "Percentage" (%), "Comma-Separated Numeric" (#,###), and
#' "Numeric Range - Averaged" (# - #).
#'
#' @export
#'
#' @examples
#' HandleSpecialChars_ResultMeasureValue <-
#'   TADA_ConvertSpecialChars(Data_Nutrients_UT, "ResultMeasureValue")
#' unique(HandleSpecialChars_ResultMeasureValue$
#'   TADA.ResultMeasureValueDataTypes.Flag)
#'
#' HandleSpecialChars_DetLimMeasureValue <-
#'   TADA_ConvertSpecialChars(
#'     Data_Nutrients_UT,
#'     "TADA.DetectionQuantitationLimitMeasure.MeasureValue"
#'   )
#' unique(HandleSpecialChars_DetLimMeasureValue$
#'   TADA.DetectionQuantitationLimitMeasure.MeasureValueDataTypes.Flag)
TADA_ConvertSpecialChars <- function(.data, col, percent.ave = TRUE) {
  if (!col %in% names(.data)) {
    stop("Suspect column name specified for input dataset.")
  }

  # Define new column names
  numcol <- paste0("TADA.", col)
  flagcol <- paste0("TADA.", col, "DataTypes.Flag")

  # Create dummy columns for easy handling in function
  chars.data <- .data
  names(chars.data)[names(chars.data) == col] <- "orig"
  chars.data <- chars.data %>%
    dplyr::select(-tidyselect::any_of(c(col, numcol, flagcol)))
  chars.data$masked <- chars.data$orig

  # Add percentage character to dissolved oxygen saturation ResultMeasureValue
  # so percentage and percentage - range averaged can be identified correctly
  if (col == "ResultMeasureValue") {
    do.units <- c("%", "% SATURATN")

    chars.data$masked <- ifelse(chars.data$CharacteristicName == "Dissolved oxygen (DO)" & chars.data$ResultMeasure.MeasureUnitCode %in% do.units,
      paste(chars.data$masked, "%"), chars.data$masked
    )

    # updates percentage units where NA
    chars.data$TADA.ResultMeasure.MeasureUnitCode <- ifelse(
      grepl("%", chars.data$masked), "%", chars.data$ResultMeasure.MeasureUnitCode
    )

    # TADA.ResultMeasure.MeasureUnitCode to uppercase
    chars.data$TADA.ResultMeasure.MeasureUnitCode <- toupper(chars.data$TADA.ResultMeasure.MeasureUnitCode)
  }

  # If column is already numeric, just discern between NA and numeric
  if (is.numeric(chars.data$orig)) {
    clean.data <- chars.data %>%
      dplyr::mutate(flag = dplyr::case_when(
        is.na(masked) ~ as.character("NA - Not Available"),
        TRUE ~ as.character("Numeric")
      ))
  } else {
    chars.data$masked <- gsub(" ", "", chars.data$masked) # get rid of white space for subsequent sorting
    # Detect special characters in column and populate new flag column with descriptor
    # of the specific type of character/data type
    clean.data <- chars.data %>%
      dplyr::mutate(
        flag = dplyr::case_when(
          is.na(masked) ~ as.character("NA - Not Available"),
          (!is.na(suppressWarnings(as.numeric(masked)) == TRUE)) ~ as.character("Numeric"),
          (grepl("<", masked) == TRUE) ~ as.character("Less Than"),
          (grepl(">", masked) == TRUE) ~ as.character("Greater Than"),
          (grepl("~", masked) == TRUE) ~ as.character("Approximate Value"),
          (grepl("[A-Za-z]", masked) == TRUE) ~ as.character("Text"),
          (grepl("%", masked) == TRUE) ~ as.character("Percentage"),
          (grepl(",", masked) == TRUE) ~ as.character("Comma-Separated Numeric"),
          (grepl("\\d\\-\\d", masked) == TRUE) ~ as.character("Numeric Range - Averaged"),
          (grepl("([1-9]|[1-9][0-9]|100)-([1-9]|[1-9][0-9]|100)%", masked) == TRUE) ~ as.character("Percentage Range - Averaged"),
          # because * is a special character you have to escape\\ it:
          (grepl("\\*", masked) == TRUE) ~ as.character("Approximate Value"),
          (!stringi::stri_enc_mark(masked) %in% c("ASCII")) ~ as.character("Non-ASCII Character(s)"),
          TRUE ~ "Coerced to NA"
        ),
        flag = ifelse(flag == "Greater Than" & grepl("%", masked) & grepl("-", masked),
          "Percentage Range - Averaged", flag
        ),
        flag = ifelse(flag == "Less Than" & grepl("%", masked) & grepl("-", masked),
          "Percentage Range - Averaged", flag
        )
      )
  }

  if (percent.ave == FALSE) {
    num.range.filter <- c("Numeric Range - Averaged")
  }

  if (percent.ave == TRUE) {
    num.range.filter <- c("Numeric Range - Averaged", "Percentage Range - Averaged")
  }

  # Result Values that are numeric ranges with the format #-# are converted to an average of the two numbers expressed in the range.
  if (any(clean.data$flag %in% num.range.filter)) {
    numrange <- subset(clean.data, clean.data$flag %in% num.range.filter)
    notnumrange <- subset(clean.data, !clean.data$flag %in% num.range.filter)
    numrange <- numrange %>%
      dplyr::mutate(
        masked = stringr::str_remove(masked, "[1-9]\\)"),
        masked = stringr::str_remove(masked, "%"),
        masked = stringr::str_remove(masked, ">"),
        masked = stringr::str_remove(masked, "<")
      ) %>%
      tidyr::separate(masked, into = c("num1", "num2"), sep = "-", remove = TRUE) %>%
      dplyr::mutate_at(c("num1", "num2"), as.numeric)
    numrange$masked <- as.character(rowMeans(numrange[, c("num1", "num2")], na.rm = TRUE))
    numrange <- numrange[, !names(numrange) %in% c("num1", "num2")] %>%
      dplyr::mutate(masked = ifelse(flag == "Percentage Range - Average", paste(masked, "%", sep = ""), masked))

    clean.data <- plyr::rbind.fill(notnumrange, numrange)
  }

  # In the new TADA column, convert to numeric and remove some specific special
  # characters.
  clean.data$masked <- suppressWarnings(as.numeric(stringr::str_replace_all(
    clean.data$masked, c("<" = "", ">" = "", "~" = "", "%" = "", "\\*" = "", "1\\)" = "")
  )))

  # this updates the DataTypes.Flag to "NA - Not Available" if NA
  clean.data$flag <- ifelse(
    is.na(clean.data$flag),
    "NA - Not Available",
    clean.data$flag
  )

  # remove columns to be replaced
  clean.data <- clean.data %>%
    dplyr::select(!(tidyselect::any_of(numcol)), !(tidyselect::any_of(flagcol)))

  # Rename to original column name, TADA column name, and flag column name
  names(clean.data)[names(clean.data) == "orig"] <- col
  names(clean.data)[names(clean.data) == "masked"] <- numcol
  names(clean.data)[names(clean.data) == "flag"] <- flagcol

  clean.data <- TADA_OrderCols(clean.data)

  return(clean.data)
}

#' Substitute Preferred Characteristic Name for Deprecated Names
#'
#' This function uses the WQX Characteristic domain table to substitute
#' deprecated (i.e. retired and/or suspect) Characteristic Names with the new
#' name in the TADA.CharacteristicName column. TADA_SubstituteDeprecatedChars is
#' run within TADA_AutoClean, which runs within TADA_DataRetrieval and
#' (if autoclean = TRUE) in TADA_BigDataRetrieval. Therefore, deprecated
#' characteristic names are harmonized to the new name automatically upon data
#' retrieval. TADA_SubstituteDeprecatedChars can also be used by itself on a
#' user supplied dataset that is in the WQX/WQP format, if desired. This
#' solution works for both EPA WQX and USGS NWIS provided data.
#'
#' Enter ?TADA_GetCharacteristicRef() to review a list of all WQX
#' characteristics, the including deprecated names (Char_Flag). This can be
#' used as a crosswalk between the deprecated names (CharacteristicName) and
#' their new names (Comparable.Name).
#'
#' @param .data TADA dataframe
#'
#' @return Input TADA dataframe with substituted characteristic names in
#'   TADA.CharacteristicName column. Original columns are unchanged.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # download nutrient data in MT from 2022 and set autoclean = FALSE
#' df <- TADA_DataRetrieval(
#'   startDate = "2022-01-01",
#'   endDate = "2022-12-31",
#'   characteristicType = "Nutrient",
#'   statecode = "MT",
#'   applyautoclean = FALSE, ask = FALSE
#' )
#' df2 <- TADA_SubstituteDeprecatedChars(df)
#' # in this example, "Inorganic nitrogen (nitrate and nitrite)" is a USGS NWIS
#' # characteristic that is deprecated and
#' # "Phosphate-phosphorus***retired***use Total Phosphorus, mixed forms"
#' # is a deprecated WQX name. Both are are transformed to their new names.
#' # review characteristic names before and after transformation
#' unique(df2$CharacteristicName)
#' unique(df2$TADA.CharacteristicName)
#'
#' df3 <- TADA_DataRetrieval(
#'   startDate = "2022-01-01", endDate = "2022-12-31",
#'   characteristicType = "Nutrient", statecode = "WY", applyautoclean = FALSE,
#'   ask = FALSE
#' )
#' df4 <- TADA_SubstituteDeprecatedChars(df3)
#' unique(df4$CharacteristicName)
#' unique(df4$TADA.CharacteristicName)
#' }
TADA_SubstituteDeprecatedChars <- function(.data) {
  TADA_CheckColumns(.data, expected_cols = c("CharacteristicName"))

  if ("TADA.CharacteristicName" %in% colnames(.data)) {
    .data <- .data
  } else {
    # create uppercase version of original CharacteristicName
    .data$TADA.CharacteristicName <- toupper(.data$CharacteristicName)
  }

  # read in characteristic reference table with deprecation information, filter to deprecated terms and for "retired" in CharacteristicName.
  # remove all characters after first "*" in CharacteristicName and remove any leading or trailing white space to make compatible with deprecated NWIS CharacteristicName.
  nwis.table <- utils::read.csv(system.file("extdata", "WQXCharacteristicRef.csv", package = "EPATADA")) %>%
    dplyr::filter(
      Char_Flag == "Deprecated",
      grepl("retired", CharacteristicName)
    ) %>%
    dplyr::mutate(CharacteristicName = trimws(stringr::str_split(CharacteristicName, "\\*", simplify = T)[, 1]))

  # read in characteristic reference table with deprecation information and filter to deprecated terms.
  # join with deprecated NWIS CharacteristicName data.frame.
  ref.table <- utils::read.csv(system.file("extdata", "WQXCharacteristicRef.csv", package = "EPATADA")) %>%
    dplyr::filter(Char_Flag == "Deprecated") %>%
    rbind(nwis.table)

  rm(nwis.table)

  # merge to dataset
  .data <- merge(.data, ref.table, all.x = TRUE)
  # if CharacteristicName is deprecated and comparable name is not blank (NA), use the provided Comparable.Name. Otherwise, keep TADA.CharacteristicName as-is.
  .data$TADA.CharacteristicName <- ifelse(!is.na(.data$Char_Flag) & !.data$Comparable.Name %in% c(""), .data$Comparable.Name, .data$TADA.CharacteristicName)

  howmany <- length(.data$Char_Flag[!is.na(.data$Char_Flag)])

  if (howmany > 0) {
    chars <- unique(.data$CharacteristicName[!is.na(.data$Char_Flag)])
    chars <- paste0(chars, collapse = "; ")
    print(paste0(howmany, " results in your dataset have one of the following deprecated characteristic names: ", chars, ". These names have been substituted with the updated preferred names in the TADA.CharacteristicName field."))
  } else {
    print("No deprecated characteristic names found in dataset.")
  }

  .data <- .data %>% dplyr::select(-Char_Flag, -Comparable.Name)
  .data <- TADA_OrderCols(.data)
  return(.data)
}

#' Create TADA.ComparableDataIdentifier Column
#'
#' This utility function creates the TADA.ComparableDataIdentifier column by pasting
#' together TADA.CharacteristicName, TADA.ResultSampleFractionText, TADA.MethodSpeciationName,
#' and TADA.ResultMeasure.MeasureUnitCode.
#'
#' @param .data TADA dataframe
#'
#' @return Input TADA dataframe with added TADA.ComparableDataIdentifier column.
#'
#' @export
TADA_CreateComparableID <- function(.data) {
  TADA_CheckColumns(.data,
    expected_cols = c(
      "TADA.CharacteristicName",
      "TADA.ResultSampleFractionText",
      "TADA.MethodSpeciationName",
      "TADA.ResultMeasure.MeasureUnitCode"
    )
  )
  .data$TADA.ComparableDataIdentifier <-
    paste(.data$TADA.CharacteristicName,
      .data$TADA.ResultSampleFractionText,
      .data$TADA.MethodSpeciationName,
      .data$TADA.ResultMeasure.MeasureUnitCode,
      sep = "_"
    )
  return(.data)
}

#' Convert a delimited string to the format used by WQX 3.0 profiles for
#' one-to-manys
#'
#' This utility function takes a delimited string of entities, and a delimiter
#' (which defaults to a comma) and returns a new string in the WQX 3.0 format
#' of c("StringA","StringB").
#'
#' @param delimited_string Character argument. Should be a string delimited
#' by the character passed in the delimiter parameter.
#'
#' @param delimiter Character argument The character used to delimit the
#' string passed in delimited_string. Defaults to a comma.
#'
#' @return String.
#'
#' @export
TADA_FormatDelimitedString <- function(delimited_string, delimiter = ",") {
  esc_chars <- c("|", "^", "&", ".", "!", "?", "\\", "*", "-", "+", ">", "<")
  if (delimiter %in% esc_chars) {
    delimiter <- paste0("\\", delimiter)
  }
  return(paste0('["', gsub(delimiter, '","', delimited_string), '"]'))
}

#' Identify and group nearby monitoring locations (UNDER ACTIVE DEVELOPMENT)
#'
#' This function takes a TADA dataset and identifies the NHD catchments that
#' each MonitoringLocation is in. Within each group of MonitoringLocations in
#' the same catchment, a distance matrix is created and an adjacency matrix
#' is used to identify groups of nearby sites within the same catchment.
#' Groups of nearby sites are given a new TADA.MonitoringLocationIdentifier
#' which is created by concatenating the original
#' TADA.MonitoringLocationIdentifiers of all sites within the group. Two
#' additional columns, TADA.NearbySiteGroup and TADA.NearbySites.Flag are added.
#' TADA.NearbySiteGroup contains a unique numeric value for each group of sites
#' within the same catchment. TADA.NearbySites.Flag identifies whether or not
#' a result is from a grouped site or not and for grouped sites identifies how
#' the TADA prefixed metadata columns (TADA.MonitoringLocationName,
#' TADA.MonitoringLocationTypeName, TADA.LongitudeMeasure, and
#' TADA.LatitudeMeasure) were determined.
#'
#' @param .data TADA dataframe OR TADA sites dataframe.
#'
#' @param dist_buffer Numeric. The maximum distance (in meters) two sites can be
#'   from one another to be considered "nearby" and grouped together.
#'
#' @param nhd_res Character argument to determine whether the NHD catchments
#' used should be high ("Hi") or medium ("Med") res. Default = "Hi" for
#' consistency with other TADA geospatial functions.
#'
#' @param org_hierarchy Vector of organization identifiers that acts as the
#' order in which the function should select representative metadata for
#' grouped sites based on the organization that collected the data. If left
#' blank, the function does not factor organization in to the metadata
#' selection process. When a vector is provided, the metadata will first be
#' selected by organization and the "meta_select" argument will only be
#' applied in cases where more than one set of metadata per site grouping are
#' available from the highest ranking organization available.
#'
#' @param meta_select Character argument to determine how metadata should be
#' selected if no org_hierarchy is specified or if multiple options for metadata
#' from the same organization exist. Options are "oldest", which selects the
#' metadata associated with the oldest result from the grouped nearby sites,
#' "newest", which selects the metadata associated with the newest result from
#' the grouped nearby sites, "count" which selects the metadata associated with
#' the greatest number of results, and "random" which selects random metadata
#' from the site group. The default is meta_select = "random".
#'
#' @return Input dataframe with a TADA.SiteGroup column that indicates the
#' nearby site group each monitoring location belongs to. Grouped sites are
#' concatenated in the TADA.MonitoringLocationIdentifier column
#' (e.g. "USGS-10010025","USGS-10010026" enclosed in square brackets []).
#' This JSON array is the new TADA monitoring location ID for the grouped sites.
#' TADA.MonitoringLocationIdentifier can be leveraged to analyze data from
#' nearby sites together (as the same general location). Related metadata,
#' including TADA.MonitoringLocationName, TADA.LatitudeMeasure,
#' TADA.LongitudeMeasure, and TADA.MonitoringLocationTypeName are added to the
#' input df. Meta data selection is determined by user inputs as users may
#' provide an organization hierarchy to determine which organization's
#' metadata should be preferentially selected and further specify whether
#' metadata should be selected: randomly, by the oldest or newest sampling date,
#' or by the site with the greatest number of overall results in the TADA df.
#'
#' @export
#'
#' @examples
#' GroupNearbySites_100m <- TADA_FindNearbySites(Data_Nutrients_UT)
#' GroupNearbySites_10m <- TADA_FindNearbySites(Data_Nutrients_UT,
#'   dist_buffer = 10
#' )
TADA_FindNearbySites <- function(.data, dist_buffer = 100,
                                 nhd_res = "Hi",
                                 org_hierarchy = "none",
                                 meta_select = "random") {
  # check .data is data.frame
  TADA_CheckType(.data, "data.frame", "Input object")

  # .data required columns
  required_cols <- c(
    "TADA.MonitoringLocationIdentifier",
    "TADA.LongitudeMeasure",
    "TADA.LatitudeMeasure"
  )

  # check .data has required columns
  TADA_CheckColumns(.data, required_cols)

  rm(required_cols)

  # retain only necessary columns unique Monitoring Locations
  data_unique_mls <- .data %>%
    dplyr::select(
      TADA.MonitoringLocationIdentifier, TADA.LongitudeMeasure, TADA.LatitudeMeasure,
      HorizontalCoordinateReferenceSystemDatumName
    ) %>%
    dplyr::rename(
      LongitudeMeasure = TADA.LongitudeMeasure,
      LatitudeMeasure = TADA.LatitudeMeasure,
      MonitoringLocationIdentifier = TADA.MonitoringLocationIdentifier
    ) %>%
    dplyr::distinct()

  # convert to sf object
  data_unique_mls <- TADA_MakeSpatial(data_unique_mls)

  # fetch nhdplus catchment information
  nhd_catchments <- fetchNHD(data_unique_mls, resolution = nhd_res)

  # join catchment data to TADA df, count how many times each catchment is included in df and
  # retain only rows containing catchments which are included more than once in the df
  data_unique_mls <- data_unique_mls %>%
    sf::st_join(nhd_catchments, left = TRUE) %>%
    dplyr::rename(
      TADA.MonitoringLocationIdentifier = MonitoringLocationIdentifier,
      TADA.LongitudeMeasure = LongitudeMeasure,
      TADA.LatitudeMeasure = LatitudeMeasure
    ) %>%
    dplyr::group_by(NHD.nhdplusid) %>%
    dplyr::mutate(n_id = length(TADA.MonitoringLocationIdentifier)) %>%
    dplyr::filter(n_id > 1) %>%
    dplyr::ungroup() %>%
    dplyr::filter(!is.na(NHD.nhdplusid))

  if (nrow(data_unique_mls) == 0) { # #if no groups, give a TADA.NearbySiteGroup column filled with
    # "No nearby sites"
    print("TADA_FindNearbySites: No nearby sites detected using input buffer distance. Columns for TADA.NearbySitesFlag and TADA.NearbySiteGroup added for tracking purposes.")

    .data <- .data %>%
      dplyr::mutate(
        TADA.NearbySites.Flag = "No nearby sites detected using input buffer distance.",
        TADA.NearbySiteGroup = NA
      )

    rm(data_unique_mls, nhd_catchments)

    return(.data)
  }

  # check to see if any site groups are found
  if (nrow(data_unique_mls) > 1) {
    # remove intermediate object
    rm(nhd_catchments)

    # need to split into separate dfs based on nhdplusid group
    df_nhdgroups_list <- split(data_unique_mls, data_unique_mls$NHD.nhdplusid)

    rm(data_unique_mls)

    # prepare dfs for distance matrix
    df_nhdgroups_prep <- df_nhdgroups_list %>%
      purrr::map(~ .x %>%
        dplyr::select(
          TADA.MonitoringLocationIdentifier,
          TADA.LongitudeMeasure,
          TADA.LatitudeMeasure
        ))

    rm(df_nhdgroups_list)

    # create a distance matrix in meters
    dist.mats <- purrr::map(df_nhdgroups_prep, ~ {
      dist.matrix <- as.matrix(sf::st_distance(.x)) # Great Circle distance since in lat/lon

      rownames(dist.matrix) <- .x$TADA.MonitoringLocationIdentifier
      colnames(dist.matrix) <- .x$TADA.MonitoringLocationIdentifier

      dist.matrix
    })

    # remove intermediate object
    rm(df_nhdgroups_prep)

    # remove units from distance matrix
    dist.mats <- purrr::map(dist.mats, ~ units::drop_units(.x))

    # convert distances to those within buffer (1) and beyond buffer (0)
    binary.mats <- purrr::map(dist.mats, function(mat) {
      bin.mat <- apply(mat, 2, function(x) as.integer(x <= dist_buffer))
      diag(bin.mat) <- 0
      rownames(bin.mat) <- rownames(mat)
      colnames(bin.mat) <- colnames(mat)
      #
      return(bin.mat)
    })

    rm(dist.mats)

    # loop through distance matrix and extract site groups
    find_groups <- function(matrix) {
      # create adjacency graph
      adj.graph <- igraph::graph_from_adjacency_matrix(matrix, mode = "undirected", diag = FALSE)

      # find connected sites
      comp.results <- igraph::components(adj.graph)

      # create site group dfs
      group_sites <- data.frame(
        Site = names(comp.results$membership),
        Group = comp.results$membership,
        row.names = NULL
      )

      return(group_sites)
    }

    # apply site grouping function to matrix
    site.groups.list <- purrr::map(binary.mats, find_groups)

    site.groups.list <- purrr::imap(site.groups.list, ~ .x %>%
      dplyr::mutate(df_number = .y))

    rm(find_groups, binary.mats)

    # create df of all groups and create unique id for each group
    combined.group.df <- dplyr::bind_rows(site.groups.list) %>%
      dplyr::group_by(Group, df_number) %>%
      dplyr::mutate(Count = length(Site)) %>%
      dplyr::filter(Count > 1) %>%
      # create new TADA.MonitoringLocationIdentifier
      dplyr::mutate(
        TADA.MonitoringLocationIdentifier.New = paste(Site, collapse = ", "),
        TADA.MonitoringLocationIdentifier.New = paste0(
          "[",
          TADA.MonitoringLocationIdentifier.New,
          "]"
        ),
        TADA.NearbySiteGroup = dplyr::cur_group_id()
      ) %>%
      dplyr::ungroup()

    rm(site.groups.list)

    # create crosswalk of TADA.MonitoringLocationIdentifer and new TADA.MonitoringLocationIdentifier
    ml.crosswalk <- combined.group.df %>%
      dplyr::select(Site, TADA.MonitoringLocationIdentifier.New, TADA.NearbySiteGroup) %>%
      dplyr::rename(TADA.MonitoringLocationIdentifier = Site) %>%
      dplyr::distinct()

    rm(combined.group.df)

    # create df of grouped sites, including all activity start dates for the sites
    grouped.sites <- ml.crosswalk %>%
      dplyr::left_join(.data, by = dplyr::join_by(TADA.MonitoringLocationIdentifier)) %>%
      dplyr::select(
        TADA.MonitoringLocationIdentifier.New, TADA.NearbySiteGroup,
        TADA.MonitoringLocationName, TADA.LatitudeMeasure, TADA.LongitudeMeasure,
        TADA.MonitoringLocationTypeName, ActivityStartDate, OrganizationIdentifier
      ) %>%
      dplyr::distinct()

    # create a df of unique grouped sites, do not include any activity start dates
    grouped.no.dates <- grouped.sites %>%
      dplyr::select(-ActivityStartDate) %>%
      dplyr::distinct()

    # create list of orgs from TADA df
    all.orgs <- unique(.data$OrganizationIdentifier)

    # compare list of orgs from TADA df to user supplied org_hierachy to find missing orgs
    missing.orgs <- setdiff(all.orgs, org_hierarchy)

    # create string for flagging based on meta_select
    if (meta_select == "random") {
      meta.string <- "random selection"
    }

    if (meta_select == "oldest") {
      meta.string <- "oldest sampling date"
    }

    if (meta_select == "newest") {
      meta.string <- "most reccent sampling date"
    }

    if (meta_select == "count") {
      meta.string <- "greatest number of results in TADA data frame"
    }

    # use org hierarchy for first round of metadata selection
    if (isTRUE(org_hierarchy == "none")) {
      # create string for flagging
      org.string <- "Metadata were selected by "


      # print message
      print("TADA_FindNearbySites: No org_hierarchy supplied by user. Organization will not be taken into account during metadata selection.")

      # create consistent org rank to facilitate meta data selection (all orgs ranked equally)
      org.ranks <- as.data.frame(all.orgs) %>%
        dplyr::mutate(OrgRank = 99) %>%
        dplyr::rename(OrganizationIdentifier = all.orgs)
    }

    # if org hierarchy is supplied by user
    if (org_hierarchy[1] != "none") {
      # create string for flagging
      org.string <- "Metadata were selected by filtering based on the user supplied hierarchy, then by "

      if (!is.vector(org_hierarchy)) {
        stop("TADA_FindNearbySites: Organization hierarchy must be supplied as a vector.")
      }

      if (length(org_hierarchy) == 0) {
        stop("TADA_FindNearbySites: No organization identifiers were supplied.")
      }

      if (length(missing.orgs) > 0) {
        print(paste0(
          "TADA_FindNearbySites: ", length(missing.orgs),
          " organization identifiers are missing from org_hierarchy (",
          stringi::stri_replace_last(paste(missing.orgs, collapse = ", "),
            fixed = ", ", " and "
          ), ").",
          " Function will continue to run using partial org_hierarchy."
        ))

        # create df for organization ranks from user-supplied hierarchy
        org.ranks <- as.data.frame(org_hierarchy) %>%
          dplyr::mutate(OrgRank = dplyr::row_number()) %>%
          dplyr::rename(OrganizationIdentifier = org_hierarchy)

        # create df for all organizations missing from user-supplied hierarchy
        # all missing orgs will share the same rank and be ranked below any orgs supplied by user
        missing.ranks <- as.data.frame(missing.orgs) %>%
          dplyr::mutate(OrgRank = (length(org_hierarchy) + 1)) %>%
          dplyr::rename(OrganizationIdentifier = missing.orgs)

        # add missing orgs to org rank df
        org.ranks <- org.ranks %>%
          dplyr::bind_rows(missing.ranks)
      }

      if (length(missing.orgs) == 0) {
        # create df for organization ranks from user-supplied hierarchy
        org.ranks <- as.data.frame(org_hierarchy) %>%
          dplyr::mutate(OrgRank = dplyr::row_number()) %>%
          dplyr::rename(OrganizationIdentifier = org_hierarchy)
      }


      rm(all.orgs, missing.orgs)
    }

    # add org ranks to df of all TADA.MonitoringLocationIdentifier.New
    org.ranks.added <- grouped.no.dates %>%
      dplyr::left_join(org.ranks, by = dplyr::join_by(OrganizationIdentifier))

    rm(org.ranks)

    # filter to retain metadata for TADA.MonitoringLocation.New where there is only one set of
    # metadata from the highest ranked org
    org.meta.filter <- org.ranks.added %>%
      dplyr::group_by(TADA.NearbySiteGroup, OrgRank) %>%
      dplyr::mutate(CountSites = length(OrgRank)) %>%
      dplyr::filter(CountSites == 1) %>%
      dplyr::ungroup() %>%
      dplyr::select(-OrgRank, -CountSites) %>%
      dplyr::mutate(TADA.NearbySites.Flag = paste0(
        "This monitoring location was grouped with other nearby site(s). ",
        org.string, meta.string, "."
      ))

    # select and assign metadata randomly for grouped sites when meta_select equals "random"

    if (meta_select == "random") {
      # select random metadata where necessary (no org rank or more than one set of metdata for one
      # TADA.MonitoringLocationIdentifier.New)
      random.meta <- org.ranks.added %>%
        dplyr::ungroup() %>%
        dplyr::filter(!TADA.NearbySiteGroup %in%
          org.meta.filter$TADA.NearbySiteGroup) %>%
        dplyr::group_by(TADA.NearbySiteGroup) %>%
        dplyr::slice_min(OrgRank) %>%
        dplyr::select(
          TADA.MonitoringLocationIdentifier.New,
          TADA.MonitoringLocationName,
          TADA.LatitudeMeasure, TADA.LongitudeMeasure,
          TADA.MonitoringLocationTypeName,
          TADA.NearbySiteGroup
        ) %>%
        dplyr::distinct() %>%
        dplyr::slice_sample(n = 1) %>%
        dplyr::ungroup()


      # join the metadata filtering results to create a df with all metadat to apply to TADA df by
      # TADA.MonitoringLocationIdentifier.New
      select.meta <- random.meta %>%
        dplyr::full_join(org.meta.filter, by = names(random.meta)) %>%
        dplyr::select(-OrganizationIdentifier) %>%
        dplyr::rename(
          TADA.MonitoringLocationName.New = TADA.MonitoringLocationName,
          TADA.LatitudeMeasure.New = TADA.LatitudeMeasure,
          TADA.LongitudeMeasure.New = TADA.LongitudeMeasure,
          TADA.MonitoringLocationTypeName.New = TADA.MonitoringLocationTypeName
        ) %>%
        dplyr::mutate(TADA.NearbySites.Flag = "This monitoring location was grouped with other nearby site(s). Metadata were selected randomly.")

      # remove intermediate objects
      rm(random.meta, org.ranks.added)
    }

    if (meta_select == "oldest" | meta_select == "newest") {
      # prep site groups for metadata selection by date
      date.meta <- grouped.sites %>%
        dplyr::left_join(org.ranks.added, by = dplyr::join_by(
          TADA.MonitoringLocationIdentifier.New,
          TADA.NearbySiteGroup,
          TADA.MonitoringLocationName,
          TADA.LatitudeMeasure,
          TADA.LongitudeMeasure,
          TADA.MonitoringLocationTypeName,
          OrganizationIdentifier
        )) %>%
        dplyr::filter(!TADA.MonitoringLocationIdentifier.New %in%
          org.meta.filter$TADA.MonitoringLocationIdentifier.New) %>%
        dplyr::mutate(OrgRank = ifelse(is.na(OrgRank), rank.default, OrgRank)) %>%
        dplyr::group_by(TADA.MonitoringLocationIdentifier.New)

      if (meta_select == "oldest") {
        # select oldest metadata for group
        date.meta <- date.meta %>%
          dplyr::slice_min(ActivityStartDate)

        # specify oldest for flagging string
        date.choice <- "oldest"
      }

      if (meta_select == "newest") {
        # select newest metadata for group
        date.meta <- date.meta %>%
          dplyr::slice_max(ActivityStartDate)

        # specify newest for flagging string
        date.choice <- "newest"
      }

      # select metadata by date
      select.meta <- date.meta %>%
        dplyr::full_join(org.meta.filter, by = dplyr::join_by(
          TADA.MonitoringLocationIdentifier.New,
          TADA.NearbySiteGroup,
          TADA.MonitoringLocationName,
          TADA.LatitudeMeasure,
          TADA.LongitudeMeasure,
          TADA.MonitoringLocationTypeName,
          OrganizationIdentifier
        )) %>%
        dplyr::select(-OrganizationIdentifier, -OrgRank, -ActivityStartDate) %>%
        dplyr::rename(
          TADA.MonitoringLocationName.New = TADA.MonitoringLocationName,
          TADA.LatitudeMeasure.New = TADA.LatitudeMeasure,
          TADA.LongitudeMeasure.New = TADA.LongitudeMeasure,
          TADA.MonitoringLocationTypeName.New = TADA.MonitoringLocationTypeName
        ) %>%
        dplyr::group_by(TADA.NearbySiteGroup) %>%
        dplyr::slice_sample(n = 1) %>%
        dplyr::mutate(TADA.NearbySites.Flag = paste0(
          "This monitoring location was grouped with other",
          " nearby site(s). Metadata were selected from ",
          "the ", date.choice, " result available."
        ))

      rm(date.meta)
    }

    if (meta_select == "count") {
      # select metadata by finding site with greatest number of results in TADA df
      select.meta <- org.ranks.added %>%
        dplyr::left_join(.data, by = dplyr::join_by(
          TADA.MonitoringLocationName, TADA.LatitudeMeasure,
          TADA.LongitudeMeasure, TADA.MonitoringLocationTypeName
        )) %>%
        dplyr::group_by(TADA.MonitoringLocationIdentifier) %>%
        dplyr::mutate(NCount = length(TADA.ResultMeasureValue)) %>%
        dplyr::ungroup() %>%
        dplyr::select(-TADA.MonitoringLocationIdentifier) %>%
        dplyr::distinct() %>%
        dplyr::group_by(TADA.NearbySiteGroup) %>%
        dplyr::slice_max(NCount) %>%
        dplyr::slice_sample(n = 1) %>%
        dplyr::select(
          TADA.MonitoringLocationIdentifier.New, TADA.NearbySiteGroup,
          TADA.MonitoringLocationName, TADA.LatitudeMeasure, TADA.LongitudeMeasure,
          TADA.MonitoringLocationTypeName
        ) %>%
        dplyr::rename(
          TADA.MonitoringLocationName.New = TADA.MonitoringLocationName,
          TADA.LatitudeMeasure.New = TADA.LatitudeMeasure,
          TADA.LongitudeMeasure.New = TADA.LongitudeMeasure,
          TADA.MonitoringLocationTypeName.New = TADA.MonitoringLocationTypeName
        ) %>%
        dplyr::mutate(TADA.NearbySites.Flag = "This monitoring location was grouped with other nearby site(s). Metadata were selected from MonitoringLocation with the most results available across all characteristics.")
    }

    # remove intermediate objects
    rm(grouped.no.dates, grouped.sites, org.meta.filter, org.string, meta.string)

    # remove site group from ml.crosswalk
    ml.crosswalk <- ml.crosswalk %>%
      dplyr::select(-TADA.NearbySiteGroup) %>%
      dplyr::distinct()

    # join selected metadata to TADA df
    .data <- .data %>%
      dplyr::left_join(ml.crosswalk, by = dplyr::join_by(TADA.MonitoringLocationIdentifier)) %>%
      dplyr::left_join(select.meta, by = dplyr::join_by(TADA.MonitoringLocationIdentifier.New)) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(
        TADA.MonitoringLocationName = ifelse(!is.na(TADA.MonitoringLocationName.New),
          TADA.MonitoringLocationName.New,
          TADA.MonitoringLocationName
        ),
        TADA.LatitudeMeasure = ifelse(!is.na(TADA.LatitudeMeasure.New),
          TADA.LatitudeMeasure.New,
          TADA.LatitudeMeasure
        ),
        TADA.LongitudeMeasure = ifelse(!is.na(TADA.LongitudeMeasure.New),
          TADA.LongitudeMeasure.New,
          TADA.LongitudeMeasure
        ),
        TADA.MonitoringLocationTypeName = ifelse(!is.na(TADA.MonitoringLocationTypeName.New),
          TADA.MonitoringLocationTypeName.New,
          TADA.MonitoringLocationTypeName
        ),
        TADA.MonitoringLocationIdentifier = ifelse(!is.na(TADA.MonitoringLocationIdentifier.New),
          TADA.MonitoringLocationIdentifier.New,
          TADA.MonitoringLocationIdentifier
        )
      ) %>%
      dplyr::select(
        -TADA.MonitoringLocationIdentifier.New, -TADA.MonitoringLocationName.New,
        -TADA.LatitudeMeasure.New, -TADA.LongitudeMeasure.New,
        -TADA.MonitoringLocationTypeName.New
      ) %>%
      TADA_OrderCols()

    # remove intermediate objects
    rm(select.meta, ml.crosswalk)

    # add flag for any ungrouped sites and order columns correctly
    .data <- TADA_OrderCols(.data) %>%
      dplyr::mutate(TADA.NearbySites.Flag = ifelse(is.na(TADA.NearbySites.Flag),
        "No nearby sites detected using input buffer distance.",
        TADA.NearbySites.Flag
      ))

    # return TADA df with added columns for tracking
    return(.data)
  }
}


#' Get grouped monitoring stations that are near each other
#'
#' This function takes a TADA dataset that contains grouped nearby monitoring stations
#' and returns a unique dataset of the original MonitoringLocationIdentifier, the grouped
#' TADA.MonitoringLocationIdentifier, as well as the original and TADA-prefixed LongitudeMeasure,
#' LatitudeMeasure, MonitoringLocationName, and MonitoringLocationTypeName, filtered for only those
#' stations that have a nearby station.
#'
#' @param .data TADA dataframe
#'
#' @return New dataframe with unique combinations of original and TADA MonitoringLocationIdentifier,
#' LongitudeMeasure, LatitudeMeasure, MonitoringLocationName, and MonitoringLocationTypeName.
#'
#' @export
TADA_GetUniqueNearbySites <- function(.data) {
  # check .data is data.frame
  TADA_CheckType(.data, "data.frame", "Input object")

  # .data required columns
  required_cols <- c(
    "MonitoringLocationIdentifier", "TADA.MonitoringLocationIdentifier",
    "MonitoringLocationName", "TADA.MonitoringLocationName",
    "LongitudeMeasure", "TADA.LongitudeMeasure",
    "LatitudeMeasure", "TADA.LatitudeMeasure",
    "MonitoringLocationTypeName", "TADA.MonitoringLocationTypeName",
    "MonitoringLocationDescriptionText", "TADA.NearbySites.Flag",
    "TADA.NearbySiteGroup"
  )
  # check .data has required columns
  TADA_CheckColumns(.data, required_cols)

  # filter only for locations with nearby sites
  .data <- .data %>%
    dplyr::filter(
      !is.na(TADA.NearbySites.Flag),
      TADA.NearbySites.Flag != "No nearby sites detected using input buffer distance."
    ) %>%
    # retain only required columns
    dplyr::select(dplyr::all_of(required_cols)) %>%
    # retain only unique records
    dplyr::distinct()

  return(.data)
}


#' Generate a random WQP dataset
#'
#' Retrieves data for a period of time in the past 20 years using
#' TADA_DataRetrieval. This function can be used for testing functions on
#' random datasets. Only random data sets with 10 or more results will be returned.
#' If a random dataset has fewer than 10 results, the function will automatically
#' create another random WQP query until a df with greater than 10 results is returned.
#'
#' @param number_of_days Numeric. The default is 1, which will query and retrieve
#' data for a random two-day period (e.g.startDate = "2015-04-21",
#' endDate = "2015-04-22"). The user can change this number to select additional days
#' if desired.
#'
#' @param choose_random_state Boolean (TRUE or FALSE). The default is FALSE.
#' If FALSE, the function will query all data in the WQP for the number_of_days
#' specified (national query). If TRUE, the function will select a random state
#' and only retrieve data for that state.
#'
#' @param autoclean Boolean (TRUE or FALSE). The default is TRUE.
#' If FALSE, the function will NOT apply the TADA_AutoClean as part of the
#' TADA_DataRetrieval. If TRUE, the function WILL apply TADA_AutoClean as part of
#' TADA_DataRetrieval.
#'
#' @return Random WQP dataset.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' df <- TADA_RandomTestingData(number_of_days = 1, choose_random_state = FALSE)
#' df <- TADA_RandomTestingData(number_of_days = 10, choose_random_state = TRUE)
#' df <- TADA_RandomTestingData(number_of_days = 5, choose_random_state = TRUE, autoclean = FALSE)
#' }
TADA_RandomTestingData <- function(number_of_days = 1, choose_random_state = FALSE,
                                   autoclean = TRUE) {
  get_random_data <- function(ndays = number_of_days, state_choice = choose_random_state,
                              ac = autoclean, ask = FALSE) {
    # choose a random day within the last 20 years
    twenty_yrs_ago <- Sys.Date() - 20 * 365
    random_start_date <- twenty_yrs_ago + sample(20 * 365, 1)
    # choose a random start date and add any number_of_days (set that as the end date)
    end_date <- random_start_date + ndays

    if (state_choice == TRUE) {
      load(system.file("extdata", "statecodes_df.Rdata", package = "EPATADA"))
      state <- sample(statecodes_df$STUSAB, 1)
    }

    if (state_choice == FALSE) {
      state <- "null"
    }

    print(c(
      startDate = as.character(random_start_date),
      endDate = as.character(end_date),
      statecode = state
    ))

    if (ac == TRUE) {
      dat <- TADA_DataRetrieval(
        startDate = as.character(random_start_date),
        endDate = as.character(end_date),
        statecode = state,
        applyautoclean = TRUE,
        ask = FALSE
      )
    }

    if (ac == FALSE) {
      dat <- TADA_DataRetrieval(
        startDate = as.character(random_start_date),
        endDate = as.character(end_date),
        statecode = state,
        applyautoclean = FALSE,
        ask = FALSE
      )
    }
    return(dat)
  }

  verify_random_data <- function() {
    df <- get_random_data()
    while (nrow(df) < 10) {
      df <- get_random_data()
    }
    return(df)
  }

  df <- verify_random_data()
  return(df)
}

#' Aggregate multiple result values to a min, max, or mean
#'
#' This function groups TADA data by user-defined columns and aggregates the
#' TADA.ResultMeasureValue to a minimum, maximum, or average value.
#'
#' @param .data A TADA dataframe
#' @param grouping_cols The column names used to group the data
#' @param agg_fun The aggregation function used on the grouped data. This can
#'   either be 'min', 'max', or 'mean'.
#' @param clean Boolean. Determines whether other measurements from the group
#'   aggregation should be removed or kept in the dataframe. If clean = FALSE,
#'   additional measurements are indicated in the
#'   TADA.ResultValueAggregation.Flag as "Used in aggregation function but not
#'   selected".
#'
#' @return A TADA dataframe with aggregated values combined into one row. If the
#'   agg_fun is 'min' or 'max', the function will select the row matching the
#'   aggregation condition and flag it as the selected measurement. If the
#'   agg_fun is 'mean', the function will select a random row from the
#'   aggregated rows to represent the metadata associated with the mean value,
#'   and gives the row a unique ResultIdentifier: the original ResultIdentifier
#'   with the prefix "TADA-". Function adds a TADA.ResultValueAggregation.Flag
#'   to indicate which rows have been aggregated.
#'
#' @export
#'
#' @examples
#' # Load example dataset
#' data(Data_6Tribes_5y)
#' # Select maximum value per day, site, comparable data identifier, result detection condition,
#' # and activity type code. Clean all non-maximum measurements from grouped data.
#' Data_6Tribes_5y_agg <- TADA_AggregateMeasurements(Data_6Tribes_5y,
#'   grouping_cols = c(
#'     "ActivityStartDate", "TADA.MonitoringLocationIdentifier",
#'     "TADA.ComparableDataIdentifier", "ResultDetectionConditionText",
#'     "ActivityTypeCode"
#'   ),
#'   agg_fun = "max", clean = TRUE
#' )
#'
#' # Calculate a mean value per day, site, comparable data identifier, result detection condition,
#' # and activity type code. Keep all measurements used to calculate mean measurement.
#' Data_6Tribes_5y_agg <- TADA_AggregateMeasurements(Data_6Tribes_5y,
#'   grouping_cols = c(
#'     "ActivityStartDate", "TADA.MonitoringLocationIdentifier",
#'     "TADA.ComparableDataIdentifier", "ResultDetectionConditionText",
#'     "ActivityTypeCode"
#'   ),
#'   agg_fun = "mean", clean = FALSE
#' )
TADA_AggregateMeasurements <- function(.data, grouping_cols = c("ActivityStartDate", "TADA.MonitoringLocationIdentifier", "TADA.ComparableDataIdentifier", "ResultDetectionConditionText", "ActivityTypeCode"), agg_fun = c("max", "min", "mean"), clean = TRUE) {
  TADA_CheckColumns(.data, grouping_cols)
  agg_fun <- match.arg(agg_fun)

  # Find multiple values in groups
  ncount <- .data %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(grouping_cols))) %>%
    dplyr::summarise(ncount = length(ResultIdentifier))

  if (max(ncount$ncount) < 2) {
    print("No rows to aggregate.")
    return(.data)
  } else {
    dat <- merge(.data, ncount, all.x = TRUE)

    if (any(is.na(dat$TADA.ResultMeasureValue))) {
      "Warning: your dataset contains one or more rows where TADA.ResultMeasureValue = NA. Recommend removing these rows before proceeding. Otherwise, the function will not consider NAs in its calculations."
    }

    dat$TADA.ResultValueAggregation.Flag <- ifelse(dat$ncount == 1, "No aggregation needed", paste0("Used in ", agg_fun, " aggregation function but not selected"))
    multiples <- dat %>% dplyr::filter(ncount > 1)

    dat <- dat %>% dplyr::select(-ncount)

    if (agg_fun == "max") {
      out <- multiples %>%
        dplyr::group_by(dplyr::across(dplyr::all_of(grouping_cols))) %>%
        dplyr::slice_max(order_by = TADA.ResultMeasureValue, n = 1, with_ties = FALSE)
      dat$TADA.ResultValueAggregation.Flag <- ifelse(dat$ResultIdentifier %in% out$ResultIdentifier, paste0("Selected as ", agg_fun, " aggregate value"), dat$TADA.ResultValueAggregation.Flag)
    }
    if (agg_fun == "min") {
      out <- multiples %>%
        dplyr::group_by(dplyr::across(dplyr::all_of(grouping_cols))) %>%
        dplyr::slice_min(order_by = TADA.ResultMeasureValue, n = 1, with_ties = FALSE)
      dat$TADA.ResultValueAggregation.Flag <- ifelse(dat$ResultIdentifier %in% out$ResultIdentifier, paste0("Selected as ", agg_fun, " aggregate value"), dat$TADA.ResultValueAggregation.Flag)
    }
    if (agg_fun == "mean") {
      out <- multiples %>%
        dplyr::group_by(dplyr::across(dplyr::all_of(grouping_cols))) %>%
        dplyr::mutate(TADA.ResultMeasureValue1 = mean(TADA.ResultMeasureValue, na.rm = TRUE)) %>%
        dplyr::slice_sample(n = 1) %>%
        dplyr::mutate(TADA.ResultValueAggregation.Flag = paste0("Selected as ", agg_fun, " aggregate value, with randomly selected metadata from a row in the aggregate group"))
      out <- out %>%
        dplyr::select(-TADA.ResultMeasureValue) %>%
        dplyr::rename(TADA.ResultMeasureValue = TADA.ResultMeasureValue1) %>%
        dplyr::mutate(ResultIdentifier = paste0("TADA-", ResultIdentifier))
      dat <- plyr::rbind.fill(dat, out)
    }

    if (clean == TRUE) {
      dat <- subset(dat, !dat$TADA.ResultValueAggregation.Flag %in% c(paste0("Used in ", agg_fun, " aggregation function but not selected")))
    }

    dat <- TADA_OrderCols(dat)
    print("Aggregation results:")
    print(table(dat$TADA.ResultValueAggregation.Flag))
    return(dat)
  }
}


#' Get bounding box JSON
#'
#' @param bbox A bounding box from the sf function st_bbox
#' @return A string containing bounding box JSON that can be passed to an
#' ArcGIS feature layer in the Input Geometry field
#'
#' @examples
#' \dontrun{
#' # Load example dataset
#' data(Data_6Tribes_5y)
#' # Get the bounding box of the data
#' bbox <- sf::st_bbox(
#'   c(
#'     xmin = min(Data_6Tribes_5y$TADA.LongitudeMeasure),
#'     ymin = min(Data_6Tribes_5y$TADA.LatitudeMeasure),
#'     xmax = max(Data_6Tribes_5y$TADA.LongitudeMeasure),
#'     ymax = max(Data_6Tribes_5y$TADA.LatitudeMeasure)
#'   ),
#'   crs = sf::st_crs(Data_6Tribes_5y)
#' )
#' # Get a string containing the JSON of the bounding box
#' getBboxJson(bbox)
#' }
getBboxJson <- function(bbox) {
  json <- paste0('{"xmin":', bbox[1], ',"ymin":', bbox[2], ',"xmax":', bbox[3], ',"ymax":', bbox[4], "}")
  return(json)
}

#' Create icon(s) to be used to represent points on a map feature layer
#' pchIcons is used within TADA_addPoints
#'
#' Uses the different plotting symbols available in R to create PNG files that can be used as markers on a map feature layer.
#'
#' @param pch Plot character code; either a single number or a vector of multiple numbers. Possible values available at https://www.geeksforgeeks.org/r-plot-pch-symbols-different-point-shapes-available-in-r/. Defaults to 1 (an open circle).
#' @param width Width of the plot character. Defaults to 30 pixels.
#' @param height Height of the plot character. Defaults to 30 pixels.
#' @param bg Background color of the plot character Defaults to transparent.
#' @param col Color(s) of the plot character(s). Defaults to black.
#' @param lwd Line width. Optional, defaults to NULL.
#' @return Path(s) to PNG file(s) in a temp folder on user's computer.
#'
#' @examples
#' \dontrun{
#' # Create three PNG files, a red circle, blue triangle, and yellow "X", each on a green background.
#' pchIcons(c(1, 2, 4), 40, 40, "green", c("red", "blue", "yellow"))
#' }
pchIcons <- function(pch = 1,
                     width = 30,
                     height = 30,
                     bg = "transparent",
                     col = "black",
                     lwd = NULL) {
  n <- length(pch)
  files <- character(n)
  for (i in seq_len(n)) {
    f <- tempfile(fileext = ".png")
    grDevices::png(f,
      width = width,
      height = height,
      bg = bg
    )
    graphics::par(mar = c(0, 0, 0, 0))
    graphics::plot.new()
    graphics::points(
      .5,
      .5,
      pch = pch[i],
      col = col[i],
      cex = min(width, height) / 8,
      lwd = lwd
    )
    grDevices::dev.off()
    files[i] <- f
  }
  files
}

#' Retrieve feature layer from ArcGIS REST service
#' getFeatureLayer is used by writeLayer to write feature layers to local files
#'
#' @param url URL of the layer REST service, ending with "/query". Example: https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/2/query (American Indian Reservations)
#' @param bbox A bounding box from the sf function st_bbox; used to filter the query results. Optional; defaults to NULL.
#' @return ArcGIS feature layer
#'
#' @examples
#' \dontrun{
#' # Load example dataset
#' data(Data_Nutrients_UT)
#' # Get the bounding box of the data
#' bbox <- sf::st_bbox(
#'   c(
#'     xmin = min(Data_Nutrients_UT$TADA.LongitudeMeasure),
#'     ymin = min(Data_Nutrients_UT$TADA.LatitudeMeasure),
#'     xmax = max(Data_Nutrients_UT$TADA.LongitudeMeasure),
#'     ymax = max(Data_Nutrients_UT$TADA.LatitudeMeasure)
#'   ),
#'   crs = sf::st_crs(Data_Nutrients_UT)
#' )
#' # Get the American Indian Reservations feature layer,
#' # filtered by the bounding box for the Data_Nutrients_UT example dataset
#' getFeatureLayer("https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/2/query", bbox)
#' }
getFeatureLayer <- function(url, bbox = NULL) {
  if (is.null(bbox)) {
    inputGeom <- NULL
  } else {
    inputGeom <- getBboxJson(bbox)
  }
  url <- paste0(url, "?where=1%3D1&outfields=*&returnGeometry=true&geometry=", inputGeom, "&f=geojson")
  layer <- sf::read_sf(url)
  return(layer)
}


#' Download a shapefile from an API and save it to a local folder, overwriting existing file if it exists
#' writeLayer is used by TADA_UpdateTribalLayers in TADAGeospatialRefLayers.R.
#'
#' @param url URL of the layer REST service, ending with "/query". Example: https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/2/query (American Indian Reservations)
#' @param layerfilepath Local path to save the .shp file to
#'
#' @examples
#' \dontrun{
#' # Get the Oklahoma Tribal Statistical Areas feature layer and write
#' # local file to inst/extdata/OKTribe.shp
#' OKTribeUrl <- "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4/query"
#' writeLayer(OKTribeUrl, "inst/extdata/OKTribe.shp")
#' }
writeLayer <- function(url, layerfilepath) {
  layer <- getFeatureLayer(url)
  # Attribute names can only be up to 10 characters long when saved to .dbf as part of sf::st_write.
  # They are truncated automatically but TOTALAREA_MI and TOTALAREA_KM will not be unique after being
  # truncated, so explicitly rename them first if they exist to avoid error.
  if ("TOTALAREA_MI" %in% colnames(layer)) {
    layer <- layer %>% dplyr::rename(
      TAREA_MI = TOTALAREA_MI,
      TAREA_KM = TOTALAREA_KM
    )
  }
  sf::st_write(layer, layerfilepath, delete_layer = TRUE)
}


#' Get a shapefile from a local folder, optionally crop it by a bounding box, and return it as a sf object
#' getLayer is used within TADA_addPolys and TADA_addPoints
#'
#' @param layerfilepath Local path to the .shp file for the layer
#' @param bbox A bounding box from the sf function st_bbox; used to filter the query results. Optional; defaults to NULL.
#' @return sf object containing the layer
#'
#'
#' @examples
#' \dontrun{
#' # Load example dataset
#' data(Data_6Tribes_5y_Harmonized)
#' # Get the bounding box of the data
#' bbox <- sf::st_bbox(
#'   c(
#'     xmin = min(Data_6Tribes_5y_Harmonized$TADA.LongitudeMeasure),
#'     ymin = min(Data_6Tribes_5y_Harmonized$TADA.LatitudeMeasure),
#'     xmax = max(Data_6Tribes_5y_Harmonized$TADA.LongitudeMeasure),
#'     ymax = max(Data_6Tribes_5y_Harmonized$TADA.LatitudeMeasure)
#'   ),
#'   crs = sf::st_crs(Data_6Tribes_5y_Harmonized)
#' )
#' # Get the American Indian Reservations feature layer,
#' # filtered by the bounding box for the Data_6Tribes_5y_Harmonized
#' # example dataset
#' layerfilepath <- "extdata/AmericanIndian.shp"
#' getLayer(layerfilepath, bbox)
#' }
getLayer <- function(layerfilepath, bbox = NULL) {
  layer <- sf::st_read(system.file(layerfilepath, package = "EPATADA"))
  if (!(is.null(bbox))) {
    sf::sf_use_s2(FALSE)
    layer <- sf::st_make_valid(layer)
    layer <- sf::st_crop(layer, bbox)
  }
  return(layer)
}

#' Get text for tribal marker popup
#' getPopup is used within TADA_addPolys and TADA_addPoints
#'
#' @param layer A map feature layer
#' @param layername Name of the layer
#' @return Vector of strings to be used as the text for the popups when clicking on a tribal marker
#'
#' @examples
#' \dontrun{
#' # Get the Oklahoma Tribal Statistical Areas feature layer
#' layer <- getLayer("extdata/OKTribe.shp")
#' # Get popup text for individual markers
#' getPopup(layer, "Oklahoma Tribal Statistical Areas")
#' }
getPopup <- function(layer, layername) {
  text <- paste0("<strong>", layername, "</strong><p>")
  cols <-
    c(
      "TRIBE_NAME" = "Tribe Name",
      "PARCEL_NO" = "Parcel Number",
      "EPA_ID" = "EPA ID",
      "TYPE" = "Type"
    )

  for (i in seq(1, length(cols))) {
    if (names(cols[i]) %in% colnames(layer)) {
      text <- paste0(text, "<strong>", cols[i], "</strong>: ", layer[[names(cols[i])]], "<br>")
    }
  }
  return(text)
}


#' Add polygons from an ArcGIS feature layer to a leaflet map
#'
#' @param map A leaflet map
#' @param layerfilepath Local path to the .shp file for the layer
#' @param layergroup Name of the layer group
#' @param layername Name of the layer
#' @param bbox A bounding box from the sf function st_bbox; used to filter the query results. Optional; defaults to NULL.
#' @return The original map with polygons from the feature layer added to it.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Create a leaflet map
#' lmap <- leaflet::leaflet() %>%
#'   leaflet::addProviderTiles("Esri.WorldTopoMap", group = "World topo") %>%
#'   leaflet::addMapPane("featurelayers", zIndex = 300)
#' # Add the American Indian Reservations feature layer to the map
#' lmap <- TADA_addPolys(lmap, "extdata/AmericanIndian.shp", "Tribes", "American Indian Reservations")
#' lmap
#' }
TADA_addPolys <- function(map, layerfilepath, layergroup, layername, bbox = NULL) {
  layer <- getLayer(layerfilepath, bbox)
  if (is.null(layer)) {
    return(map)
  }
  lbbox <- sf::st_bbox(layer)
  if (is.na(lbbox[1])) {
    return(map)
  }
  areaColumn <- "ALAND_KM"
  if (!(areaColumn %in% colnames(layer))) {
    areaColumn <- "AREA_KM"
  }

  map <-
    leaflet::addPolygons(
      map,
      data = layer,
      color = "#A0522D",
      weight = 0.35,
      smoothFactor = 0.5,
      opacity = 1.0,
      fillOpacity = 0.2,
      fillColor = ~ leaflet::colorNumeric("Oranges", layer[[areaColumn]])(layer[[areaColumn]]),
      highlightOptions = leaflet::highlightOptions(
        color = "white",
        weight = 2,
        bringToFront = TRUE
      ),
      popup = getPopup(layer, layername),
      group = layergroup,
      options = leaflet::pathOptions(pane = "featurelayers")
    )
  return(map)
}

#' Add points from an ArcGIS feature layer to a leaflet map
#'
#' @param map A leaflet map
#' @param layerfilepath Local path to the .shp file for the layer
#' @param layergroup Name of the layer group
#' @param layername Name of the layer
#' @param bbox A bounding box from the sf function st_bbox; used to filter the query results. Optional; defaults to NULL.
#' @return The original map with polygon from the feature layer added to it.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Create a leaflet map
#' lmap <- leaflet::leaflet() %>%
#'   leaflet::addProviderTiles("Esri.WorldTopoMap", group = "World topo") %>%
#'   leaflet::addMapPane("featurelayers", zIndex = 300)
#' # Add the Virginia Federally Recognized Tribes feature layer to the map
#' lmap <- TADA_addPoints(
#'   lmap, "extdata/VATribe.shp",
#'   "Tribes", "Virginia Federally Recognized Tribes"
#' )
#' lmap
#' }
TADA_addPoints <- function(map, layerfilepath, layergroup, layername, bbox = NULL) {
  layer <- getLayer(layerfilepath, bbox)
  if (is.null(layer)) {
    return(map)
  }
  lbbox <- sf::st_bbox(layer)
  if (is.na(lbbox[1])) {
    return(map)
  }
  shapes <- c(2) # open triangle; for other options see https://www.geeksforgeeks.org/r-plot-pch-symbols-different-point-shapes-available-in-r/
  iconFiles <- pchIcons(shapes, width = 20, height = 20, col = c("#CC7722"), lwd = 2)
  map <- leaflet::addMarkers(
    map,
    data = layer,
    icon = ~ leaflet::icons(
      iconUrl = iconFiles[],
      popupAnchorX = 20,
      popupAnchorY = 0
    ),
    popup = getPopup(layer, layername),
    group = layergroup,
    options = leaflet::pathOptions(pane = "featurelayers")
  )
  return(map)
}

#' Create Characteristic/MeasureUnitCode/MethodSpeciation Ref
#'
#' Creates data frame of unique combinations of TADA.CharacteristicName,
#' TADA.ResultMeasure.MeasureUnitCode, ResultMeasure.MeasureUnitCode, and
#' TADA.MethodSpeciationName in a TADA data frame.
#'
#' @param .data A TADA data frame.
#'
#' @return A data frame with unique combinations of TADA.CharacteristicName,
#' TADA.ResultMeasure.MeasureUnitCode, ResultMeasure.MeasureUnitCode, and
#' TADA.MethodSpeciationName
#'
#' @export
#'
#' @examples
#' UniqueCharUnitSpecExample <-
#'   TADA_UniqueCharUnitSpeciation(Data_NCTCShepherdstown_HUC12)
TADA_UniqueCharUnitSpeciation <- function(.data) {
  required_cols <- c(
    "TADA.CharacteristicName", "TADA.ResultSampleFractionText",
    "TADA.MethodSpeciationName", "TADA.ResultMeasure.MeasureUnitCode",
    "TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode"
  )

  # Check to see if TADA_Autoclean has been run
  if (any(required_cols %in% colnames(.data)) == FALSE) {
    print("The dataframe does not contain the required fields. Running TADA_AutoClean to create required columns.")
    .data <- TADA_AutoClean(.data)
  }

  if (all(required_cols %in% colnames(.data)) == TRUE) {
    .data <- .data
  }

  # Create df of unique codes and characteristic names(from TADA.CharacteristicName and TADA.ResultMeasure.MeasureUnitCode) in TADA data frame
  data.units.result <- .data %>%
    dplyr::select(
      TADA.CharacteristicName, TADA.ResultMeasure.MeasureUnitCode,
      ResultMeasure.MeasureUnitCode, TADA.MethodSpeciationName
    ) %>%
    dplyr::filter(!is.na(TADA.ResultMeasure.MeasureUnitCode)) %>%
    dplyr::distinct()

  # Create df of unique codes and characteristic names(from TADA.CharacteristicName and TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode) in TADA data frame
  data.units.det <- .data %>%
    dplyr::select(
      TADA.CharacteristicName, TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode,
      DetectionQuantitationLimitMeasure.MeasureUnitCode, TADA.MethodSpeciationName
    ) %>%
    dplyr::filter(!is.na(TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode)) %>%
    dplyr::distinct() %>%
    dplyr::rename(
      TADA.ResultMeasure.MeasureUnitCode = TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode,
      ResultMeasure.MeasureUnitCode = DetectionQuantitationLimitMeasure.MeasureUnitCode
    )

  # Create combined df with all unique codes (both result and det units) and characteristic names
  data.units <- data.units.result %>%
    dplyr::full_join(data.units.det, by = c(
      "TADA.CharacteristicName", "TADA.ResultMeasure.MeasureUnitCode",
      "ResultMeasure.MeasureUnitCode", "TADA.MethodSpeciationName"
    )) %>%
    dplyr::distinct() %>%
    dplyr::group_by(TADA.CharacteristicName) %>%
    dplyr::mutate(NCode = length(unique(TADA.ResultMeasure.MeasureUnitCode))) %>%
    dplyr::filter(!is.na(TADA.ResultMeasure.MeasureUnitCode) |
      is.na(TADA.ResultMeasure.MeasureUnitCode) & NCode == 1) %>%
    dplyr::select(-NCode)

  return(data.units)
}


#' Create Color Palette For Use in Graphs and Maps
#'
#' Creates a consistent color palette for use in TADA visualizations. Consistent
#' color pairings can be utilized by setting col_pair = TRUE, in which each row
#' consists of two values for color outlines and fills. Currently, the palette
#' is utilizing the "Okabe-Ito" palette from base R via the palette.colors
#' function. The palette includes 9 colors by default. However, additional colors
#' can be added to the palette as needed as more complex visualization functions
#' are added to the TADA package.
#'
#' @param col_pair Boolean argument. Optional argument to define consistent color
#' pairings for outlines/fills of TADA figures defined by the row values in a dataframe.
#'
#' @return A color palette based on the "Okabe-Ito" palette, extended to 15 colors,
#'  with modifications for use in mapping and graphing functions
#'
#' @export
#'
#' @examples
#' TestColorPalette <- TADA_ColorPalette()
#' TestColorPalettePairings <- TADA_ColorPalette(col_pair = TRUE)
#' TestColorPalettePairings
TADA_ColorPalette <- function(col_pair = FALSE) {
  pal <- c(
    "#000000", "#835A00", "#DC851E", "#059FA4", "#56B4E9",
    "#005258", "#A1A522", "#F0E442", "#66A281", "#1E6F98",
    "#4F5900", "#813B00", "#CD758F", "#B686A1", "#999999"
  )

  # Defines two color columns to be used as the color pairings in a dataframe
  col1 <- c()
  col2 <- c()
  col_combo <- data.frame()

  # Each row defines the pairing of colors to be used if col_pair is TRUE
  if (col_pair == TRUE) {
    col1 <- c(pal[5], pal[3], pal[7], pal[14])
    col2 <- c(pal[10], pal[12], pal[11], pal[2])
    col_combo <- data.frame(col1, col2)
    pal <- col_combo
  }

  return(pal)
}


#' View TADA Color Palette
#'
#' View a swatch of the colors in the TADA Color palette labeled by color and
#' index number. TADA developers can reference this function when deciding which
#' colors to use in TADA visualizations. TADA users can also reference this
#' palette function to create their own visually consistent figures. TADA consistent
#' color pairings when col_pair = TRUE can be viewed in a matrix format.
#'
#' @param col_pair Boolean argument. Optional argument to view consistent color
#' pairings for outlines/fills of TADA figures defined by the row values in a dataframe.
#'
#' @return A color swatch figure based on the TADA color palette.
#'
#' @export
#'
#' @examples
#' TestViewPalette <- TADA_ViewColorPalette()
#' TestViewPalettePairing <- TADA_ViewColorPalette(col_pair = TRUE)
TADA_ViewColorPalette <- function(col_pair = FALSE) {
  # call TADA color palette
  pal <- TADA_ColorPalette()

  # determine length of color palette
  n <- length(pal)

  # create list of label colors, first one needs to be white to show up clearly
  label_colors <- rep("black", n)
  label_colors[1] <- "white"

  # create color swatch graphic
  graphics::par(mar = c(1, 0, 1, 0))
  swatch <- graphics::plot(1,
    type = "n", xlab = "", ylab = "", xlim = c(0.5, n + 0.5), ylim = c(0, 1),
    main = "TADA Palette", axes = FALSE
  )
  rect(1:n - 0.5, 0, n + 0.5, 1, col = pal, border = NA)
  text(x = 1:n, y = 0.5, labels = 1:n, pos = 3, col = label_colors)
  text(x = 1:n, y = 0.5 - 0.2, labels = pal, pos = 1, col = label_colors, cex = 0.7, srt = 90)

  col_combo <- TADA_ColorPalette(col_pair = TRUE)

  if (col_pair == TRUE) {
    swatch <- list()
    # Create a 2 x nrow/2 plotting matrix, can handle additional color pairings, in one view, if more are added in the future.
    graphics::par(mfrow = c(2, nrow(col_combo) / 2))
    # create list of label colors for pairs
    label_colors <- rep("black", 2)

    for (i in 1:nrow(col_combo)) {
      one_swatch <- graphics::plot(1,
        type = "n", xlab = "", ylab = "", xlim = c(0.5, 2.5), ylim = c(0, 1),
        main = paste0("TADA Palette Pair ", i), axes = FALSE
      )
      rect(1:2 - 0.5, 0, 2 + 0.5, 1, col = as.character(col_combo[i, ]), border = NA)
      # text(x = 1:2, y = 0.5 - 0.2, labels = 1:2, pos = 3, col = label_colors, cex = 0.75)
      text(x = 1:2 + 0.25, y = 0.5, labels = col_combo[i, ], pos = 2, col = label_colors, cex = 0.7)

      swatch[[i]] <- one_swatch
    }
  }

  graphics::par(mfrow = c(1, 1))
  swatch <- grDevices::recordPlot()

  return(swatch)
}

#' Remove NAs in Strings for Figure Titles and Axis Labels
#'
#' Returns a vector of string(s) that removes common NA strings
#' found in columns such as TADA.ComparableDataIdentifier. Can also
#' accommodate handling of certain NA texts found in any general
#' character string or a vector of strings.
#'
#' This function is meant as an internal function to remove NAs
#' from figure titles and axis labels for the TADA package.
#'
#' @param char_string Character argument. Could be a single string
#' or vector of strings that contains common "NA" strings
#' (ex: "(NA", "(NA)", "_NA", etc.)
#'
#' @return A vector string that has removed NAs from its value.
#'
#' @export
#'
#' @examples
#' # Removes NAs based on each TADA.ComparableDataIdentifier found in a dataset.
#' data(Data_Nutrients_UT)
#' UT_Titles <- TADA_CharStringRemoveNA(unique(Data_Nutrients_UT$TADA.ComparableDataIdentifier))
TADA_CharStringRemoveNA <- function(char_string) {
  # Checks if data type is a character string.
  if (!is.character(char_string)) {
    stop(paste0("TADA_CharStrignRemoveNA: 'char_string' argument is not a character string."))
  }

  # Converts character string to a vector.
  title_string <- as.vector(char_string)

  # Looks through each item in the vector and removes NAs from each.
  labs <- c()
  for (i in 1:length(char_string)) {
    labs[i] <- paste0(char_string[i], collapse = " ")
    labs[i] <- gsub("_NA|\\(NA|\\(NA)", "", labs[i])
    labs[i] <- gsub("_", " ", labs[i])
    labs <- as.vector(labs)
  }

  return(labs)
}

#' Create downloadable table
#'
#' This function creates a data table that can be downloaded as a .csv, .xlsx or .pdf.
#'
#' @param .data A data frame
#'
#' @return A data table with multiple download options (.csv, .xlsx or .pdf).
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # return ATTAINS parameter domain values
#' TADA_TableExport(rATTAINS::domain_values(domain_name = "ParameterName"))
#' }
TADA_TableExport <- function(.data = NULL) {
  if (is.null(.data)) {
    print("No dataframe provided. Please enter a dataframe to return")
  }

  data <- DT::datatable(.data,
    extensions = c("Buttons", "FixedColumns"),
    options = list(
      paging = TRUE,
      dom = "Bfrtip",
      autoWidth = TRUE,
      pageLength = 5,
      scrollX = TRUE,
      scrollCollapse = TRUE,
      buttons = c("copy", "csv", "excel", "pdf")
      # fixedColumns = list(leftColumns = 1
    ), class = "display"
  ) %>%
    DT::formatStyle(columns = colnames(.data), "fontSize" = "80%")

  return(data)
}
USEPA/TADA documentation built on April 12, 2025, 1:47 p.m.