#' Pipe operator
#'
#' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
#'
#' @name %>%
#' @rdname pipe
#' @keywords internal
#' @importFrom magrittr %>%
#' @usage lhs \%>\% rhs
#' @export
#' @param lhs A value or the magrittr placeholder.
#' @param rhs A function call using the magrittr semantics.
#' @return The result of calling `rhs(lhs)`.
NULL
#' Silence print messages
#' @name quiet
#' @param x Code to silence
#' @return Function or code output with print messages silenced
quiet <- function(x) {
sink(tempfile())
on.exit(sink())
invisible(force(x))
}
# write global variables. Gets rid of global variable NOTE in check:
utils::globalVariables(c(
"TADA.ResultValueAboveUpperThreshold.Flag", "ActivityIdentifier", "ActivityMediaName",
"ActivityStartDate", "TADA.ResultValueBelowUpperThreshold.Flag",
"TADA.ResultValueBelowLowerThreshold.Flag", "CharacteristicName",
"Conversion.Factor", "Count", "Description", "FieldName", "FieldValue",
"MethodSpecationName", "MonitoringLocationIdentifier",
"OrganizationFormalName", "OrganizationIdentifier", "ProjectDescriptionText",
"ProjectFileUrl", "ProjectIdentifier",
"ProjectMonitoringLocationWeightingUrl", "ProjectName",
"QAPPApprovalAgencyName", "QAPPApprovedIndicator",
"ResultDetectionConditionText", "ResultMeasureValue",
"SamplingDesignTypeCode", "Source", "Status", "TADA.ContinuousData.Flag",
"TADA.SuspectCoordinates.Flag", "TADA.PotentialDupRowIDs.Flag", "TADA.QAPPDocAvailable",
"Target.Unit", "Type", "Value.Unit", "TADA.AnalyticalMethod.Flag",
"TADA.MethodSpeciation.Flag", "TADA.ResultUnit.Flag",
"TADA.SampleFraction.Flag", "YearSummarized", "where", "TADA.CharacteristicName",
"ResultIdentifier", "TADA.ResultMeasureValue", "n_sites",
"n_records", "statecodes_df", "STUSAB", "ActivityStartTime.Time", "numorgs", "dup_id",
"LatitudeMeasure", "TADA.ResultMeasureValueDataTypes.Flag", "Name", "TADA.Detection_Type",
"DetectionQuantitationLimitTypeName", "TADA.Limit_Type", "multiplier", "summ", "cf",
"LongitudeMeasure", "TADA.CensoredData.Flag", "Censored_Count",
"Status2", "ActivityTypeCode", "SampleCollectionEquipmentName",
"ResultTimeBasisText", "StatisticalBaseCode", "ResultValueTypeName",
"masked", "TADA.env", "Legend", "Fields", "desc", "WQXActivityType_Cached",
"TADA.ActivityType.Flag", "Code", "ActivityTypeCode", "ResultCount",
"tot_n", "MonitoringLocationName", "TADA.LatitudeMeasure",
"TADA.LongitudeMeasure", "median", "sd", "TADA.ComparableDataIdentifier",
"desc", "Legend", "roundRV", "TADA.DuplicateID", "maxRV", "within10",
"AllGroups", "Domain.Value.Status", "Char_Flag", "Comparable.Name",
"TADA.ResultMeasureValue1", "TADA.ResultSampleFractionText",
"TADA.MethodSpeciationName", "TADA.ResultMeasure.MeasureUnitCode",
"TADA.ActivityMediaName", "TADA.NutrientSummationGroup",
"SummationName", "SummationRank", "SummationFractionNotes", "SummationSpeciationNotes",
"SummationSpeciationConversionFactor", "SummationNote", "NutrientGroup",
"Target.Speciation", "TADA.NearbySiteGroups", "numres", "TADA.SingleOrgDupGroupID",
"TADA.MeasureQualifierCode.Flag", "TADA.MeasureQualifierCode.Def", "MeasureQualifierCode",
"value", "Flag_Column", "Data_NCTCShepherdstown_HUC12", "ActivityStartDateTime",
"TADA.MultipleOrgDupGroupID", "TADA.WQXVal.Flag", "Concat", ".", "MeasureQualifierCode.Split",
"TADA.Media.Flag", "ML.Media.Flag", "TADA.UseForAnalysis.Flag",
"Unique.Identifier", "Domain", "Note.Recommendation", "Conversion.Coefficient",
"Conversion.Coefficient", "Last.Change.Date", "Value", "Minimum", "Unique.Identifier",
"Domain", "ResultMeasure.MeasureUnitCode", "Comb", "CombList",
"TADA.Target.ResultMeasure.MeasureUnitCode", "TADA.WQXUnitConversionFactor",
"TADA.WQXUnitConversionCoefficient", "TADA.Target.MethodSpeciationName",
"flag", "NConvert", "MultUnits", "CharList", "CharUnit", "SingleNearbyGroup",
"TADA.MultipleOrgDuplicate", "TADA.ResultSelectedMultipleOrgs", "Maximum",
"OBJECTID", "GLOBALID", "assessmentunitidentifier", "index", "epsg",
"ResultMeasure.MeasureUnitCode", "TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode",
"DetectionQuantitationLimitMeasure.MeasureUnitCode", "NCode",
"ATTAINS.assessmentunitidentifier", "ATTAINS_AU", "TOTALAREA_MI", "TOTALAREA_KM",
"ATTAINS_AUs", "ARD_Category", "ActivityRelativeDepthName", "DepthsByGroup",
"DepthsPerGroup", "MeanResults", "MonitoringLocationTypeName", "N", "SecchiConversion",
"TADA.ActivityBottomDepthHeightMeasure.MeasureValue",
"TADA.ActivityDepthHeightMeasure.MeasureUnitCode", "TADA.ActivityDepthHeightMeasure.MeasureValue",
"TADA.CharacteristicsForDepthProfile TADA.ConsolidatedDepth",
"TADA.ConsolidatedDepth.Bottom TADA.ConsolidatedDepth.Unit", "TADA.DepthCategory.Flag",
"TADA.DepthProfileAggregation.Flag", "TADA.NResults",
"TADA.ResultDepthHeightMeasure.MeasureUnitCode", "TADA.ResultDepthHeightMeasure.MeasureValue",
"YAxis.DepthUnit", "TADA.CharacteristicsForDepthProfile", "TADA.ConsolidatedDepth",
"TADA.ConsolidatedDepth.Bottom", "TADA.ConsolidatedDepth.Unit", "col2rgb",
"palette.colors", "rect", "rgb", "text", "CodeNoSpeciation", "ResultMeasure.MeasureUnitCode.Upper",
"TADA.MonitoringLocationIdentifier", "StringA", "StringB", "MeasureUnitCode.match",
"TADA.ActivityTopDepthHeightMeasure.MeasureValue", "group_id", "time_diff_lead", "time_diff_lag",
"NResults", "missing.group", "TADA.PairingGroup", "TADA.PairingGroup.Rank", "timediff",
"TADA.MonitoringLocationName", "TADA.MonitoringLocationTypeName",
"ATTAINS.submissionid", "HorizontalCoordinateReferenceSystemDatumName",
"NCount", "NHD.catchmentareasqkm", "NHD.comid", "NHD.nhdplusid", "NHD.resolution",
"areasqkm", "assessmentUnitIdentifier", "catchmentareasqkm", "comid",
"featureid", "geometry", "nhdplusid", "waterTypeCode", "TADA.NearbySiteGroup",
"TADA.MonitoringLocationIdentifier.New", "TADA.NearbySites.Flag", "CountSites", "Group",
"Matrix", "n_id", "OrgRank", "rank.default", "Site", "TADA.LatitudeMeasure.New",
"TADA.LongitudeMeasure.New", "TADA.MonitoringLocationName.New",
"TADA.MonitoringLocationTypeName.New", "df_number", "ASSESSMENT_UNIT_ID",
"ATTAINS.FlagParameterName", "ATTAINS.FlagUseName", "ATTAINS.ParameterName",
"CRITERIATYPEAQUAHUMHLTH", "CRITERIATYPEFRESHSALTWATER", "CRITERIATYPE_ACUTECHRONIC",
"CRITERIATYPE_WATERORG", "CRITERION_VALUE", "ENTITY_ABBR", "EPA304A.PollutantName",
"IncludeOrExclude", " MONITORING_DATA_LINK_TEXT", "MONITORING_DATA_LINK_TEXT.New",
"MS_LOCATION_ID", "MS_ORG_ID", "MonitoringDataLinkText", "OrgIDForURL", "POLLUTANT_NAME",
"ProviderName", "TADA.SingleOrgDup.Flag", "UNIT_NAME", "URLencode", "USE_CLASS_NAME_LOCATION_ETC",
"assessment_unit_identifier", "monitoring_data_link_text", "monitoring_location_identifier",
"monitoring_organization_identifier", "monitoring_stations", "organization_identifier",
"organization_identifier.y", "parameter", "use_name", "use_name.y",
"MONITORING_DATA_LINK_TEXT", "PARCEL_NO", "TRIBE_NAME", "everything",
"resultCount", "tribal_area", "txtProgressBar"
))
# global variables for tribal feature layers used in TADA_OverviewMap in Utilities.R
AKAllotmentsUrl <- "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/0/query"
AKVillagesUrl <- "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/1/query"
AmericanIndianUrl <- "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/2/query"
OffReservationUrl <- "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/3/query"
OKTribeUrl <- "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4/query"
VATribeUrl <- "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/5/query"
#' Decimal Places
#'
#' for numeric data type
#'
#' @param x Numeric data field from TADA profile
#'
#' @return Number of values to the right of the decimal point for numeric
#' type data.
TADA_DecimalPlaces <- function(x) {
if (abs(x - round(x)) > .Machine$double.eps^0.5) {
nchar(strsplit(sub("0+$", "", as.character(x)), ".", fixed = TRUE)[[1]][[2]])
} else {
return(0)
}
}
#' Insert Breaks
#'
#' This function inserts a new line into a string when it exceeds a
#' user-specified length. New lines are added to spaces in the string. This is
#' intended to make plot legends more readable and tidy.
#'
#' @param x A vector of strings
#' @param len The maximum character length a string can be before the function
#' searches for the best space to insert a new line.
#'
#' @return The same vector of strings with new lines added where appropriate.
#'
#' @export
TADA_InsertBreaks <- function(x, len = 50) {
if (nchar(x) > len) {
multiples <- floor(nchar(x) / len)
lens <- seq(len, len * multiples, by = len)
spaces <- unlist(gregexpr(" ", x))
if (max(spaces) > len) {
spots <- sapply(lens, function(x) spaces[min(which(spaces > x))])
for (i in 1:length(spots)) {
stringi::stri_sub(x, spots[i] + (i - 1), spots[i]) <- "\n "
}
}
}
return(x)
}
#' Check Type
#'
#' This function checks if the inputs to a function are of the expected type. It
#' is used at the beginning of TADA functions to ensure the
#' inputs are suitable.
#'
#' @param arg An input argument to check
#' @param type Expected class of input argument
#' @param paramName Optional name for argument to use in error message
TADA_CheckType <- function(arg, type, paramName) {
if ((type %in% class(arg)) == FALSE) {
# if optional parameter name not specified use arg in errorMessage
if (missing(paramName)) {
paramName <- arg
}
errorMessage <- paste(paramName, " must be of class '", type, "'", sep = "")
return(stop(errorMessage))
}
}
#' Check Columns
#'
#' This function checks if the expected column names are in the dataframe. It is
#' used at the beginning of TADA functions to ensure the input data frame is
#' suitable (i.e. is either the full physical/chemical results profile
#' downloaded from WQP or the TADA profile template downloaded from the EPA TADA
#' webpage.)
#'
#' @param .data A dataframe
#' @param expected_cols A vector of expected column names as strings
TADA_CheckColumns <- function(.data, expected_cols) {
if (all(expected_cols %in% colnames(.data)) == FALSE) {
stop("The dataframe does not contain the required fields. Use either the full physical/chemical profile downloaded from WQP or download the TADA profile template available on the EPA TADA webpage.")
}
}
#' TADA_ConvertSpecialChars
#'
#' This function will screen a column of the user's choice for special
#' characters. It creates a NEW column that describes the content of the column
#' prior to conversion to numeric (named "TADA.COLUMN NAME DataTypes.Flag"). It
#' also creates a NEW column to hold the new, numeric format (named "TADA.COLUMN
#' NAME"). This function will successfully convert some special character
#' formats to numeric: whitespace, >, <, ~, %, and commas are removed before
#' converting a result value to numeric. Result values in the format # - # are
#' converted to an average of the two numbers. Result values
#' containing any other text or non-numeric characters become NA in
#' the newly created "TADA.COLUMN NAME" and labeled accordingly in "TADA.COLUMN
#' NAME DataTypes.Flag".
#'
#'
#' @param .data A TADA profile object
#' @param col A character column to be converted to numeric
#'
#' @param percent.ave Boolean argument; default is percent.ave = TRUE. When
#' clean = TRUE, any percent range values will be averaged. When
#' percent.ave = FALSE, percent range values are not averaged, but are flagged.
#'
#' @return Returns the original dataframe with two new columns: the input column
#' with the prefix "TADA.", which holds the numeric form of the original column,
#' and "TADA.COLUMN NAME DataTypes.Flag", which has text describing the type of
#' data contained within the column of interest, including "Numeric",
#' "Less Than" (<), "Greater Than" (>), "Approximate Value" (~), "Text" (A-z),
#' "Percentage" (%), "Comma-Separated Numeric" (#,###), and
#' "Numeric Range - Averaged" (# - #).
#'
#' @export
#'
#' @examples
#' HandleSpecialChars_ResultMeasureValue <-
#' TADA_ConvertSpecialChars(Data_Nutrients_UT, "ResultMeasureValue")
#' unique(HandleSpecialChars_ResultMeasureValue$
#' TADA.ResultMeasureValueDataTypes.Flag)
#'
#' HandleSpecialChars_DetLimMeasureValue <-
#' TADA_ConvertSpecialChars(
#' Data_Nutrients_UT,
#' "TADA.DetectionQuantitationLimitMeasure.MeasureValue"
#' )
#' unique(HandleSpecialChars_DetLimMeasureValue$
#' TADA.DetectionQuantitationLimitMeasure.MeasureValueDataTypes.Flag)
TADA_ConvertSpecialChars <- function(.data, col, percent.ave = TRUE) {
if (!col %in% names(.data)) {
stop("Suspect column name specified for input dataset.")
}
# Define new column names
numcol <- paste0("TADA.", col)
flagcol <- paste0("TADA.", col, "DataTypes.Flag")
# Create dummy columns for easy handling in function
chars.data <- .data
names(chars.data)[names(chars.data) == col] <- "orig"
chars.data <- chars.data %>%
dplyr::select(-tidyselect::any_of(c(col, numcol, flagcol)))
chars.data$masked <- chars.data$orig
# Add percentage character to dissolved oxygen saturation ResultMeasureValue
# so percentage and percentage - range averaged can be identified correctly
if (col == "ResultMeasureValue") {
do.units <- c("%", "% SATURATN")
chars.data$masked <- ifelse(chars.data$CharacteristicName == "Dissolved oxygen (DO)" & chars.data$ResultMeasure.MeasureUnitCode %in% do.units,
paste(chars.data$masked, "%"), chars.data$masked
)
# updates percentage units where NA
chars.data$TADA.ResultMeasure.MeasureUnitCode <- ifelse(
grepl("%", chars.data$masked), "%", chars.data$ResultMeasure.MeasureUnitCode
)
# TADA.ResultMeasure.MeasureUnitCode to uppercase
chars.data$TADA.ResultMeasure.MeasureUnitCode <- toupper(chars.data$TADA.ResultMeasure.MeasureUnitCode)
}
# If column is already numeric, just discern between NA and numeric
if (is.numeric(chars.data$orig)) {
clean.data <- chars.data %>%
dplyr::mutate(flag = dplyr::case_when(
is.na(masked) ~ as.character("NA - Not Available"),
TRUE ~ as.character("Numeric")
))
} else {
chars.data$masked <- gsub(" ", "", chars.data$masked) # get rid of white space for subsequent sorting
# Detect special characters in column and populate new flag column with descriptor
# of the specific type of character/data type
clean.data <- chars.data %>%
dplyr::mutate(
flag = dplyr::case_when(
is.na(masked) ~ as.character("NA - Not Available"),
(!is.na(suppressWarnings(as.numeric(masked)) == TRUE)) ~ as.character("Numeric"),
(grepl("<", masked) == TRUE) ~ as.character("Less Than"),
(grepl(">", masked) == TRUE) ~ as.character("Greater Than"),
(grepl("~", masked) == TRUE) ~ as.character("Approximate Value"),
(grepl("[A-Za-z]", masked) == TRUE) ~ as.character("Text"),
(grepl("%", masked) == TRUE) ~ as.character("Percentage"),
(grepl(",", masked) == TRUE) ~ as.character("Comma-Separated Numeric"),
(grepl("\\d\\-\\d", masked) == TRUE) ~ as.character("Numeric Range - Averaged"),
(grepl("([1-9]|[1-9][0-9]|100)-([1-9]|[1-9][0-9]|100)%", masked) == TRUE) ~ as.character("Percentage Range - Averaged"),
# because * is a special character you have to escape\\ it:
(grepl("\\*", masked) == TRUE) ~ as.character("Approximate Value"),
(!stringi::stri_enc_mark(masked) %in% c("ASCII")) ~ as.character("Non-ASCII Character(s)"),
TRUE ~ "Coerced to NA"
),
flag = ifelse(flag == "Greater Than" & grepl("%", masked) & grepl("-", masked),
"Percentage Range - Averaged", flag
),
flag = ifelse(flag == "Less Than" & grepl("%", masked) & grepl("-", masked),
"Percentage Range - Averaged", flag
)
)
}
if (percent.ave == FALSE) {
num.range.filter <- c("Numeric Range - Averaged")
}
if (percent.ave == TRUE) {
num.range.filter <- c("Numeric Range - Averaged", "Percentage Range - Averaged")
}
# Result Values that are numeric ranges with the format #-# are converted to an average of the two numbers expressed in the range.
if (any(clean.data$flag %in% num.range.filter)) {
numrange <- subset(clean.data, clean.data$flag %in% num.range.filter)
notnumrange <- subset(clean.data, !clean.data$flag %in% num.range.filter)
numrange <- numrange %>%
dplyr::mutate(
masked = stringr::str_remove(masked, "[1-9]\\)"),
masked = stringr::str_remove(masked, "%"),
masked = stringr::str_remove(masked, ">"),
masked = stringr::str_remove(masked, "<")
) %>%
tidyr::separate(masked, into = c("num1", "num2"), sep = "-", remove = TRUE) %>%
dplyr::mutate_at(c("num1", "num2"), as.numeric)
numrange$masked <- as.character(rowMeans(numrange[, c("num1", "num2")], na.rm = TRUE))
numrange <- numrange[, !names(numrange) %in% c("num1", "num2")] %>%
dplyr::mutate(masked = ifelse(flag == "Percentage Range - Average", paste(masked, "%", sep = ""), masked))
clean.data <- plyr::rbind.fill(notnumrange, numrange)
}
# In the new TADA column, convert to numeric and remove some specific special
# characters.
clean.data$masked <- suppressWarnings(as.numeric(stringr::str_replace_all(
clean.data$masked, c("<" = "", ">" = "", "~" = "", "%" = "", "\\*" = "", "1\\)" = "")
)))
# this updates the DataTypes.Flag to "NA - Not Available" if NA
clean.data$flag <- ifelse(
is.na(clean.data$flag),
"NA - Not Available",
clean.data$flag
)
# remove columns to be replaced
clean.data <- clean.data %>%
dplyr::select(!(tidyselect::any_of(numcol)), !(tidyselect::any_of(flagcol)))
# Rename to original column name, TADA column name, and flag column name
names(clean.data)[names(clean.data) == "orig"] <- col
names(clean.data)[names(clean.data) == "masked"] <- numcol
names(clean.data)[names(clean.data) == "flag"] <- flagcol
clean.data <- TADA_OrderCols(clean.data)
return(clean.data)
}
#' Substitute Preferred Characteristic Name for Deprecated Names
#'
#' This function uses the WQX Characteristic domain table to substitute
#' deprecated (i.e. retired and/or suspect) Characteristic Names with the new
#' name in the TADA.CharacteristicName column. TADA_SubstituteDeprecatedChars is
#' run within TADA_AutoClean, which runs within TADA_DataRetrieval and
#' (if autoclean = TRUE) in TADA_BigDataRetrieval. Therefore, deprecated
#' characteristic names are harmonized to the new name automatically upon data
#' retrieval. TADA_SubstituteDeprecatedChars can also be used by itself on a
#' user supplied dataset that is in the WQX/WQP format, if desired. This
#' solution works for both EPA WQX and USGS NWIS provided data.
#'
#' Enter ?TADA_GetCharacteristicRef() to review a list of all WQX
#' characteristics, the including deprecated names (Char_Flag). This can be
#' used as a crosswalk between the deprecated names (CharacteristicName) and
#' their new names (Comparable.Name).
#'
#' @param .data TADA dataframe
#'
#' @return Input TADA dataframe with substituted characteristic names in
#' TADA.CharacteristicName column. Original columns are unchanged.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # download nutrient data in MT from 2022 and set autoclean = FALSE
#' df <- TADA_DataRetrieval(
#' startDate = "2022-01-01",
#' endDate = "2022-12-31",
#' characteristicType = "Nutrient",
#' statecode = "MT",
#' applyautoclean = FALSE, ask = FALSE
#' )
#' df2 <- TADA_SubstituteDeprecatedChars(df)
#' # in this example, "Inorganic nitrogen (nitrate and nitrite)" is a USGS NWIS
#' # characteristic that is deprecated and
#' # "Phosphate-phosphorus***retired***use Total Phosphorus, mixed forms"
#' # is a deprecated WQX name. Both are are transformed to their new names.
#' # review characteristic names before and after transformation
#' unique(df2$CharacteristicName)
#' unique(df2$TADA.CharacteristicName)
#'
#' df3 <- TADA_DataRetrieval(
#' startDate = "2022-01-01", endDate = "2022-12-31",
#' characteristicType = "Nutrient", statecode = "WY", applyautoclean = FALSE,
#' ask = FALSE
#' )
#' df4 <- TADA_SubstituteDeprecatedChars(df3)
#' unique(df4$CharacteristicName)
#' unique(df4$TADA.CharacteristicName)
#' }
TADA_SubstituteDeprecatedChars <- function(.data) {
TADA_CheckColumns(.data, expected_cols = c("CharacteristicName"))
if ("TADA.CharacteristicName" %in% colnames(.data)) {
.data <- .data
} else {
# create uppercase version of original CharacteristicName
.data$TADA.CharacteristicName <- toupper(.data$CharacteristicName)
}
# read in characteristic reference table with deprecation information, filter to deprecated terms and for "retired" in CharacteristicName.
# remove all characters after first "*" in CharacteristicName and remove any leading or trailing white space to make compatible with deprecated NWIS CharacteristicName.
nwis.table <- utils::read.csv(system.file("extdata", "WQXCharacteristicRef.csv", package = "EPATADA")) %>%
dplyr::filter(
Char_Flag == "Deprecated",
grepl("retired", CharacteristicName)
) %>%
dplyr::mutate(CharacteristicName = trimws(stringr::str_split(CharacteristicName, "\\*", simplify = T)[, 1]))
# read in characteristic reference table with deprecation information and filter to deprecated terms.
# join with deprecated NWIS CharacteristicName data.frame.
ref.table <- utils::read.csv(system.file("extdata", "WQXCharacteristicRef.csv", package = "EPATADA")) %>%
dplyr::filter(Char_Flag == "Deprecated") %>%
rbind(nwis.table)
rm(nwis.table)
# merge to dataset
.data <- merge(.data, ref.table, all.x = TRUE)
# if CharacteristicName is deprecated and comparable name is not blank (NA), use the provided Comparable.Name. Otherwise, keep TADA.CharacteristicName as-is.
.data$TADA.CharacteristicName <- ifelse(!is.na(.data$Char_Flag) & !.data$Comparable.Name %in% c(""), .data$Comparable.Name, .data$TADA.CharacteristicName)
howmany <- length(.data$Char_Flag[!is.na(.data$Char_Flag)])
if (howmany > 0) {
chars <- unique(.data$CharacteristicName[!is.na(.data$Char_Flag)])
chars <- paste0(chars, collapse = "; ")
print(paste0(howmany, " results in your dataset have one of the following deprecated characteristic names: ", chars, ". These names have been substituted with the updated preferred names in the TADA.CharacteristicName field."))
} else {
print("No deprecated characteristic names found in dataset.")
}
.data <- .data %>% dplyr::select(-Char_Flag, -Comparable.Name)
.data <- TADA_OrderCols(.data)
return(.data)
}
#' Create TADA.ComparableDataIdentifier Column
#'
#' This utility function creates the TADA.ComparableDataIdentifier column by pasting
#' together TADA.CharacteristicName, TADA.ResultSampleFractionText, TADA.MethodSpeciationName,
#' and TADA.ResultMeasure.MeasureUnitCode.
#'
#' @param .data TADA dataframe
#'
#' @return Input TADA dataframe with added TADA.ComparableDataIdentifier column.
#'
#' @export
TADA_CreateComparableID <- function(.data) {
TADA_CheckColumns(.data,
expected_cols = c(
"TADA.CharacteristicName",
"TADA.ResultSampleFractionText",
"TADA.MethodSpeciationName",
"TADA.ResultMeasure.MeasureUnitCode"
)
)
.data$TADA.ComparableDataIdentifier <-
paste(.data$TADA.CharacteristicName,
.data$TADA.ResultSampleFractionText,
.data$TADA.MethodSpeciationName,
.data$TADA.ResultMeasure.MeasureUnitCode,
sep = "_"
)
return(.data)
}
#' Convert a delimited string to the format used by WQX 3.0 profiles for
#' one-to-manys
#'
#' This utility function takes a delimited string of entities, and a delimiter
#' (which defaults to a comma) and returns a new string in the WQX 3.0 format
#' of c("StringA","StringB").
#'
#' @param delimited_string Character argument. Should be a string delimited
#' by the character passed in the delimiter parameter.
#'
#' @param delimiter Character argument The character used to delimit the
#' string passed in delimited_string. Defaults to a comma.
#'
#' @return String.
#'
#' @export
TADA_FormatDelimitedString <- function(delimited_string, delimiter = ",") {
esc_chars <- c("|", "^", "&", ".", "!", "?", "\\", "*", "-", "+", ">", "<")
if (delimiter %in% esc_chars) {
delimiter <- paste0("\\", delimiter)
}
return(paste0('["', gsub(delimiter, '","', delimited_string), '"]'))
}
#' Identify and group nearby monitoring locations (UNDER ACTIVE DEVELOPMENT)
#'
#' This function takes a TADA dataset and identifies the NHD catchments that
#' each MonitoringLocation is in. Within each group of MonitoringLocations in
#' the same catchment, a distance matrix is created and an adjacency matrix
#' is used to identify groups of nearby sites within the same catchment.
#' Groups of nearby sites are given a new TADA.MonitoringLocationIdentifier
#' which is created by concatenating the original
#' TADA.MonitoringLocationIdentifiers of all sites within the group. Two
#' additional columns, TADA.NearbySiteGroup and TADA.NearbySites.Flag are added.
#' TADA.NearbySiteGroup contains a unique numeric value for each group of sites
#' within the same catchment. TADA.NearbySites.Flag identifies whether or not
#' a result is from a grouped site or not and for grouped sites identifies how
#' the TADA prefixed metadata columns (TADA.MonitoringLocationName,
#' TADA.MonitoringLocationTypeName, TADA.LongitudeMeasure, and
#' TADA.LatitudeMeasure) were determined.
#'
#' @param .data TADA dataframe OR TADA sites dataframe.
#'
#' @param dist_buffer Numeric. The maximum distance (in meters) two sites can be
#' from one another to be considered "nearby" and grouped together.
#'
#' @param nhd_res Character argument to determine whether the NHD catchments
#' used should be high ("Hi") or medium ("Med") res. Default = "Hi" for
#' consistency with other TADA geospatial functions.
#'
#' @param org_hierarchy Vector of organization identifiers that acts as the
#' order in which the function should select representative metadata for
#' grouped sites based on the organization that collected the data. If left
#' blank, the function does not factor organization in to the metadata
#' selection process. When a vector is provided, the metadata will first be
#' selected by organization and the "meta_select" argument will only be
#' applied in cases where more than one set of metadata per site grouping are
#' available from the highest ranking organization available.
#'
#' @param meta_select Character argument to determine how metadata should be
#' selected if no org_hierarchy is specified or if multiple options for metadata
#' from the same organization exist. Options are "oldest", which selects the
#' metadata associated with the oldest result from the grouped nearby sites,
#' "newest", which selects the metadata associated with the newest result from
#' the grouped nearby sites, "count" which selects the metadata associated with
#' the greatest number of results, and "random" which selects random metadata
#' from the site group. The default is meta_select = "random".
#'
#' @return Input dataframe with a TADA.SiteGroup column that indicates the
#' nearby site group each monitoring location belongs to. Grouped sites are
#' concatenated in the TADA.MonitoringLocationIdentifier column
#' (e.g. "USGS-10010025","USGS-10010026" enclosed in square brackets []).
#' This JSON array is the new TADA monitoring location ID for the grouped sites.
#' TADA.MonitoringLocationIdentifier can be leveraged to analyze data from
#' nearby sites together (as the same general location). Related metadata,
#' including TADA.MonitoringLocationName, TADA.LatitudeMeasure,
#' TADA.LongitudeMeasure, and TADA.MonitoringLocationTypeName are added to the
#' input df. Meta data selection is determined by user inputs as users may
#' provide an organization hierarchy to determine which organization's
#' metadata should be preferentially selected and further specify whether
#' metadata should be selected: randomly, by the oldest or newest sampling date,
#' or by the site with the greatest number of overall results in the TADA df.
#'
#' @export
#'
#' @examples
#' GroupNearbySites_100m <- TADA_FindNearbySites(Data_Nutrients_UT)
#' GroupNearbySites_10m <- TADA_FindNearbySites(Data_Nutrients_UT,
#' dist_buffer = 10
#' )
TADA_FindNearbySites <- function(.data, dist_buffer = 100,
nhd_res = "Hi",
org_hierarchy = "none",
meta_select = "random") {
# check .data is data.frame
TADA_CheckType(.data, "data.frame", "Input object")
# .data required columns
required_cols <- c(
"TADA.MonitoringLocationIdentifier",
"TADA.LongitudeMeasure",
"TADA.LatitudeMeasure"
)
# check .data has required columns
TADA_CheckColumns(.data, required_cols)
rm(required_cols)
# retain only necessary columns unique Monitoring Locations
data_unique_mls <- .data %>%
dplyr::select(
TADA.MonitoringLocationIdentifier, TADA.LongitudeMeasure, TADA.LatitudeMeasure,
HorizontalCoordinateReferenceSystemDatumName
) %>%
dplyr::rename(
LongitudeMeasure = TADA.LongitudeMeasure,
LatitudeMeasure = TADA.LatitudeMeasure,
MonitoringLocationIdentifier = TADA.MonitoringLocationIdentifier
) %>%
dplyr::distinct()
# convert to sf object
data_unique_mls <- TADA_MakeSpatial(data_unique_mls)
# fetch nhdplus catchment information
nhd_catchments <- fetchNHD(data_unique_mls, resolution = nhd_res)
# join catchment data to TADA df, count how many times each catchment is included in df and
# retain only rows containing catchments which are included more than once in the df
data_unique_mls <- data_unique_mls %>%
sf::st_join(nhd_catchments, left = TRUE) %>%
dplyr::rename(
TADA.MonitoringLocationIdentifier = MonitoringLocationIdentifier,
TADA.LongitudeMeasure = LongitudeMeasure,
TADA.LatitudeMeasure = LatitudeMeasure
) %>%
dplyr::group_by(NHD.nhdplusid) %>%
dplyr::mutate(n_id = length(TADA.MonitoringLocationIdentifier)) %>%
dplyr::filter(n_id > 1) %>%
dplyr::ungroup() %>%
dplyr::filter(!is.na(NHD.nhdplusid))
if (nrow(data_unique_mls) == 0) { # #if no groups, give a TADA.NearbySiteGroup column filled with
# "No nearby sites"
print("TADA_FindNearbySites: No nearby sites detected using input buffer distance. Columns for TADA.NearbySitesFlag and TADA.NearbySiteGroup added for tracking purposes.")
.data <- .data %>%
dplyr::mutate(
TADA.NearbySites.Flag = "No nearby sites detected using input buffer distance.",
TADA.NearbySiteGroup = NA
)
rm(data_unique_mls, nhd_catchments)
return(.data)
}
# check to see if any site groups are found
if (nrow(data_unique_mls) > 1) {
# remove intermediate object
rm(nhd_catchments)
# need to split into separate dfs based on nhdplusid group
df_nhdgroups_list <- split(data_unique_mls, data_unique_mls$NHD.nhdplusid)
rm(data_unique_mls)
# prepare dfs for distance matrix
df_nhdgroups_prep <- df_nhdgroups_list %>%
purrr::map(~ .x %>%
dplyr::select(
TADA.MonitoringLocationIdentifier,
TADA.LongitudeMeasure,
TADA.LatitudeMeasure
))
rm(df_nhdgroups_list)
# create a distance matrix in meters
dist.mats <- purrr::map(df_nhdgroups_prep, ~ {
dist.matrix <- as.matrix(sf::st_distance(.x)) # Great Circle distance since in lat/lon
rownames(dist.matrix) <- .x$TADA.MonitoringLocationIdentifier
colnames(dist.matrix) <- .x$TADA.MonitoringLocationIdentifier
dist.matrix
})
# remove intermediate object
rm(df_nhdgroups_prep)
# remove units from distance matrix
dist.mats <- purrr::map(dist.mats, ~ units::drop_units(.x))
# convert distances to those within buffer (1) and beyond buffer (0)
binary.mats <- purrr::map(dist.mats, function(mat) {
bin.mat <- apply(mat, 2, function(x) as.integer(x <= dist_buffer))
diag(bin.mat) <- 0
rownames(bin.mat) <- rownames(mat)
colnames(bin.mat) <- colnames(mat)
#
return(bin.mat)
})
rm(dist.mats)
# loop through distance matrix and extract site groups
find_groups <- function(matrix) {
# create adjacency graph
adj.graph <- igraph::graph_from_adjacency_matrix(matrix, mode = "undirected", diag = FALSE)
# find connected sites
comp.results <- igraph::components(adj.graph)
# create site group dfs
group_sites <- data.frame(
Site = names(comp.results$membership),
Group = comp.results$membership,
row.names = NULL
)
return(group_sites)
}
# apply site grouping function to matrix
site.groups.list <- purrr::map(binary.mats, find_groups)
site.groups.list <- purrr::imap(site.groups.list, ~ .x %>%
dplyr::mutate(df_number = .y))
rm(find_groups, binary.mats)
# create df of all groups and create unique id for each group
combined.group.df <- dplyr::bind_rows(site.groups.list) %>%
dplyr::group_by(Group, df_number) %>%
dplyr::mutate(Count = length(Site)) %>%
dplyr::filter(Count > 1) %>%
# create new TADA.MonitoringLocationIdentifier
dplyr::mutate(
TADA.MonitoringLocationIdentifier.New = paste(Site, collapse = ", "),
TADA.MonitoringLocationIdentifier.New = paste0(
"[",
TADA.MonitoringLocationIdentifier.New,
"]"
),
TADA.NearbySiteGroup = dplyr::cur_group_id()
) %>%
dplyr::ungroup()
rm(site.groups.list)
# create crosswalk of TADA.MonitoringLocationIdentifer and new TADA.MonitoringLocationIdentifier
ml.crosswalk <- combined.group.df %>%
dplyr::select(Site, TADA.MonitoringLocationIdentifier.New, TADA.NearbySiteGroup) %>%
dplyr::rename(TADA.MonitoringLocationIdentifier = Site) %>%
dplyr::distinct()
rm(combined.group.df)
# create df of grouped sites, including all activity start dates for the sites
grouped.sites <- ml.crosswalk %>%
dplyr::left_join(.data, by = dplyr::join_by(TADA.MonitoringLocationIdentifier)) %>%
dplyr::select(
TADA.MonitoringLocationIdentifier.New, TADA.NearbySiteGroup,
TADA.MonitoringLocationName, TADA.LatitudeMeasure, TADA.LongitudeMeasure,
TADA.MonitoringLocationTypeName, ActivityStartDate, OrganizationIdentifier
) %>%
dplyr::distinct()
# create a df of unique grouped sites, do not include any activity start dates
grouped.no.dates <- grouped.sites %>%
dplyr::select(-ActivityStartDate) %>%
dplyr::distinct()
# create list of orgs from TADA df
all.orgs <- unique(.data$OrganizationIdentifier)
# compare list of orgs from TADA df to user supplied org_hierachy to find missing orgs
missing.orgs <- setdiff(all.orgs, org_hierarchy)
# create string for flagging based on meta_select
if (meta_select == "random") {
meta.string <- "random selection"
}
if (meta_select == "oldest") {
meta.string <- "oldest sampling date"
}
if (meta_select == "newest") {
meta.string <- "most reccent sampling date"
}
if (meta_select == "count") {
meta.string <- "greatest number of results in TADA data frame"
}
# use org hierarchy for first round of metadata selection
if (isTRUE(org_hierarchy == "none")) {
# create string for flagging
org.string <- "Metadata were selected by "
# print message
print("TADA_FindNearbySites: No org_hierarchy supplied by user. Organization will not be taken into account during metadata selection.")
# create consistent org rank to facilitate meta data selection (all orgs ranked equally)
org.ranks <- as.data.frame(all.orgs) %>%
dplyr::mutate(OrgRank = 99) %>%
dplyr::rename(OrganizationIdentifier = all.orgs)
}
# if org hierarchy is supplied by user
if (org_hierarchy[1] != "none") {
# create string for flagging
org.string <- "Metadata were selected by filtering based on the user supplied hierarchy, then by "
if (!is.vector(org_hierarchy)) {
stop("TADA_FindNearbySites: Organization hierarchy must be supplied as a vector.")
}
if (length(org_hierarchy) == 0) {
stop("TADA_FindNearbySites: No organization identifiers were supplied.")
}
if (length(missing.orgs) > 0) {
print(paste0(
"TADA_FindNearbySites: ", length(missing.orgs),
" organization identifiers are missing from org_hierarchy (",
stringi::stri_replace_last(paste(missing.orgs, collapse = ", "),
fixed = ", ", " and "
), ").",
" Function will continue to run using partial org_hierarchy."
))
# create df for organization ranks from user-supplied hierarchy
org.ranks <- as.data.frame(org_hierarchy) %>%
dplyr::mutate(OrgRank = dplyr::row_number()) %>%
dplyr::rename(OrganizationIdentifier = org_hierarchy)
# create df for all organizations missing from user-supplied hierarchy
# all missing orgs will share the same rank and be ranked below any orgs supplied by user
missing.ranks <- as.data.frame(missing.orgs) %>%
dplyr::mutate(OrgRank = (length(org_hierarchy) + 1)) %>%
dplyr::rename(OrganizationIdentifier = missing.orgs)
# add missing orgs to org rank df
org.ranks <- org.ranks %>%
dplyr::bind_rows(missing.ranks)
}
if (length(missing.orgs) == 0) {
# create df for organization ranks from user-supplied hierarchy
org.ranks <- as.data.frame(org_hierarchy) %>%
dplyr::mutate(OrgRank = dplyr::row_number()) %>%
dplyr::rename(OrganizationIdentifier = org_hierarchy)
}
rm(all.orgs, missing.orgs)
}
# add org ranks to df of all TADA.MonitoringLocationIdentifier.New
org.ranks.added <- grouped.no.dates %>%
dplyr::left_join(org.ranks, by = dplyr::join_by(OrganizationIdentifier))
rm(org.ranks)
# filter to retain metadata for TADA.MonitoringLocation.New where there is only one set of
# metadata from the highest ranked org
org.meta.filter <- org.ranks.added %>%
dplyr::group_by(TADA.NearbySiteGroup, OrgRank) %>%
dplyr::mutate(CountSites = length(OrgRank)) %>%
dplyr::filter(CountSites == 1) %>%
dplyr::ungroup() %>%
dplyr::select(-OrgRank, -CountSites) %>%
dplyr::mutate(TADA.NearbySites.Flag = paste0(
"This monitoring location was grouped with other nearby site(s). ",
org.string, meta.string, "."
))
# select and assign metadata randomly for grouped sites when meta_select equals "random"
if (meta_select == "random") {
# select random metadata where necessary (no org rank or more than one set of metdata for one
# TADA.MonitoringLocationIdentifier.New)
random.meta <- org.ranks.added %>%
dplyr::ungroup() %>%
dplyr::filter(!TADA.NearbySiteGroup %in%
org.meta.filter$TADA.NearbySiteGroup) %>%
dplyr::group_by(TADA.NearbySiteGroup) %>%
dplyr::slice_min(OrgRank) %>%
dplyr::select(
TADA.MonitoringLocationIdentifier.New,
TADA.MonitoringLocationName,
TADA.LatitudeMeasure, TADA.LongitudeMeasure,
TADA.MonitoringLocationTypeName,
TADA.NearbySiteGroup
) %>%
dplyr::distinct() %>%
dplyr::slice_sample(n = 1) %>%
dplyr::ungroup()
# join the metadata filtering results to create a df with all metadat to apply to TADA df by
# TADA.MonitoringLocationIdentifier.New
select.meta <- random.meta %>%
dplyr::full_join(org.meta.filter, by = names(random.meta)) %>%
dplyr::select(-OrganizationIdentifier) %>%
dplyr::rename(
TADA.MonitoringLocationName.New = TADA.MonitoringLocationName,
TADA.LatitudeMeasure.New = TADA.LatitudeMeasure,
TADA.LongitudeMeasure.New = TADA.LongitudeMeasure,
TADA.MonitoringLocationTypeName.New = TADA.MonitoringLocationTypeName
) %>%
dplyr::mutate(TADA.NearbySites.Flag = "This monitoring location was grouped with other nearby site(s). Metadata were selected randomly.")
# remove intermediate objects
rm(random.meta, org.ranks.added)
}
if (meta_select == "oldest" | meta_select == "newest") {
# prep site groups for metadata selection by date
date.meta <- grouped.sites %>%
dplyr::left_join(org.ranks.added, by = dplyr::join_by(
TADA.MonitoringLocationIdentifier.New,
TADA.NearbySiteGroup,
TADA.MonitoringLocationName,
TADA.LatitudeMeasure,
TADA.LongitudeMeasure,
TADA.MonitoringLocationTypeName,
OrganizationIdentifier
)) %>%
dplyr::filter(!TADA.MonitoringLocationIdentifier.New %in%
org.meta.filter$TADA.MonitoringLocationIdentifier.New) %>%
dplyr::mutate(OrgRank = ifelse(is.na(OrgRank), rank.default, OrgRank)) %>%
dplyr::group_by(TADA.MonitoringLocationIdentifier.New)
if (meta_select == "oldest") {
# select oldest metadata for group
date.meta <- date.meta %>%
dplyr::slice_min(ActivityStartDate)
# specify oldest for flagging string
date.choice <- "oldest"
}
if (meta_select == "newest") {
# select newest metadata for group
date.meta <- date.meta %>%
dplyr::slice_max(ActivityStartDate)
# specify newest for flagging string
date.choice <- "newest"
}
# select metadata by date
select.meta <- date.meta %>%
dplyr::full_join(org.meta.filter, by = dplyr::join_by(
TADA.MonitoringLocationIdentifier.New,
TADA.NearbySiteGroup,
TADA.MonitoringLocationName,
TADA.LatitudeMeasure,
TADA.LongitudeMeasure,
TADA.MonitoringLocationTypeName,
OrganizationIdentifier
)) %>%
dplyr::select(-OrganizationIdentifier, -OrgRank, -ActivityStartDate) %>%
dplyr::rename(
TADA.MonitoringLocationName.New = TADA.MonitoringLocationName,
TADA.LatitudeMeasure.New = TADA.LatitudeMeasure,
TADA.LongitudeMeasure.New = TADA.LongitudeMeasure,
TADA.MonitoringLocationTypeName.New = TADA.MonitoringLocationTypeName
) %>%
dplyr::group_by(TADA.NearbySiteGroup) %>%
dplyr::slice_sample(n = 1) %>%
dplyr::mutate(TADA.NearbySites.Flag = paste0(
"This monitoring location was grouped with other",
" nearby site(s). Metadata were selected from ",
"the ", date.choice, " result available."
))
rm(date.meta)
}
if (meta_select == "count") {
# select metadata by finding site with greatest number of results in TADA df
select.meta <- org.ranks.added %>%
dplyr::left_join(.data, by = dplyr::join_by(
TADA.MonitoringLocationName, TADA.LatitudeMeasure,
TADA.LongitudeMeasure, TADA.MonitoringLocationTypeName
)) %>%
dplyr::group_by(TADA.MonitoringLocationIdentifier) %>%
dplyr::mutate(NCount = length(TADA.ResultMeasureValue)) %>%
dplyr::ungroup() %>%
dplyr::select(-TADA.MonitoringLocationIdentifier) %>%
dplyr::distinct() %>%
dplyr::group_by(TADA.NearbySiteGroup) %>%
dplyr::slice_max(NCount) %>%
dplyr::slice_sample(n = 1) %>%
dplyr::select(
TADA.MonitoringLocationIdentifier.New, TADA.NearbySiteGroup,
TADA.MonitoringLocationName, TADA.LatitudeMeasure, TADA.LongitudeMeasure,
TADA.MonitoringLocationTypeName
) %>%
dplyr::rename(
TADA.MonitoringLocationName.New = TADA.MonitoringLocationName,
TADA.LatitudeMeasure.New = TADA.LatitudeMeasure,
TADA.LongitudeMeasure.New = TADA.LongitudeMeasure,
TADA.MonitoringLocationTypeName.New = TADA.MonitoringLocationTypeName
) %>%
dplyr::mutate(TADA.NearbySites.Flag = "This monitoring location was grouped with other nearby site(s). Metadata were selected from MonitoringLocation with the most results available across all characteristics.")
}
# remove intermediate objects
rm(grouped.no.dates, grouped.sites, org.meta.filter, org.string, meta.string)
# remove site group from ml.crosswalk
ml.crosswalk <- ml.crosswalk %>%
dplyr::select(-TADA.NearbySiteGroup) %>%
dplyr::distinct()
# join selected metadata to TADA df
.data <- .data %>%
dplyr::left_join(ml.crosswalk, by = dplyr::join_by(TADA.MonitoringLocationIdentifier)) %>%
dplyr::left_join(select.meta, by = dplyr::join_by(TADA.MonitoringLocationIdentifier.New)) %>%
dplyr::ungroup() %>%
dplyr::mutate(
TADA.MonitoringLocationName = ifelse(!is.na(TADA.MonitoringLocationName.New),
TADA.MonitoringLocationName.New,
TADA.MonitoringLocationName
),
TADA.LatitudeMeasure = ifelse(!is.na(TADA.LatitudeMeasure.New),
TADA.LatitudeMeasure.New,
TADA.LatitudeMeasure
),
TADA.LongitudeMeasure = ifelse(!is.na(TADA.LongitudeMeasure.New),
TADA.LongitudeMeasure.New,
TADA.LongitudeMeasure
),
TADA.MonitoringLocationTypeName = ifelse(!is.na(TADA.MonitoringLocationTypeName.New),
TADA.MonitoringLocationTypeName.New,
TADA.MonitoringLocationTypeName
),
TADA.MonitoringLocationIdentifier = ifelse(!is.na(TADA.MonitoringLocationIdentifier.New),
TADA.MonitoringLocationIdentifier.New,
TADA.MonitoringLocationIdentifier
)
) %>%
dplyr::select(
-TADA.MonitoringLocationIdentifier.New, -TADA.MonitoringLocationName.New,
-TADA.LatitudeMeasure.New, -TADA.LongitudeMeasure.New,
-TADA.MonitoringLocationTypeName.New
) %>%
TADA_OrderCols()
# remove intermediate objects
rm(select.meta, ml.crosswalk)
# add flag for any ungrouped sites and order columns correctly
.data <- TADA_OrderCols(.data) %>%
dplyr::mutate(TADA.NearbySites.Flag = ifelse(is.na(TADA.NearbySites.Flag),
"No nearby sites detected using input buffer distance.",
TADA.NearbySites.Flag
))
# return TADA df with added columns for tracking
return(.data)
}
}
#' Get grouped monitoring stations that are near each other
#'
#' This function takes a TADA dataset that contains grouped nearby monitoring stations
#' and returns a unique dataset of the original MonitoringLocationIdentifier, the grouped
#' TADA.MonitoringLocationIdentifier, as well as the original and TADA-prefixed LongitudeMeasure,
#' LatitudeMeasure, MonitoringLocationName, and MonitoringLocationTypeName, filtered for only those
#' stations that have a nearby station.
#'
#' @param .data TADA dataframe
#'
#' @return New dataframe with unique combinations of original and TADA MonitoringLocationIdentifier,
#' LongitudeMeasure, LatitudeMeasure, MonitoringLocationName, and MonitoringLocationTypeName.
#'
#' @export
TADA_GetUniqueNearbySites <- function(.data) {
# check .data is data.frame
TADA_CheckType(.data, "data.frame", "Input object")
# .data required columns
required_cols <- c(
"MonitoringLocationIdentifier", "TADA.MonitoringLocationIdentifier",
"MonitoringLocationName", "TADA.MonitoringLocationName",
"LongitudeMeasure", "TADA.LongitudeMeasure",
"LatitudeMeasure", "TADA.LatitudeMeasure",
"MonitoringLocationTypeName", "TADA.MonitoringLocationTypeName",
"MonitoringLocationDescriptionText", "TADA.NearbySites.Flag",
"TADA.NearbySiteGroup"
)
# check .data has required columns
TADA_CheckColumns(.data, required_cols)
# filter only for locations with nearby sites
.data <- .data %>%
dplyr::filter(
!is.na(TADA.NearbySites.Flag),
TADA.NearbySites.Flag != "No nearby sites detected using input buffer distance."
) %>%
# retain only required columns
dplyr::select(dplyr::all_of(required_cols)) %>%
# retain only unique records
dplyr::distinct()
return(.data)
}
#' Generate a random WQP dataset
#'
#' Retrieves data for a period of time in the past 20 years using
#' TADA_DataRetrieval. This function can be used for testing functions on
#' random datasets. Only random data sets with 10 or more results will be returned.
#' If a random dataset has fewer than 10 results, the function will automatically
#' create another random WQP query until a df with greater than 10 results is returned.
#'
#' @param number_of_days Numeric. The default is 1, which will query and retrieve
#' data for a random two-day period (e.g.startDate = "2015-04-21",
#' endDate = "2015-04-22"). The user can change this number to select additional days
#' if desired.
#'
#' @param choose_random_state Boolean (TRUE or FALSE). The default is FALSE.
#' If FALSE, the function will query all data in the WQP for the number_of_days
#' specified (national query). If TRUE, the function will select a random state
#' and only retrieve data for that state.
#'
#' @param autoclean Boolean (TRUE or FALSE). The default is TRUE.
#' If FALSE, the function will NOT apply the TADA_AutoClean as part of the
#' TADA_DataRetrieval. If TRUE, the function WILL apply TADA_AutoClean as part of
#' TADA_DataRetrieval.
#'
#' @return Random WQP dataset.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' df <- TADA_RandomTestingData(number_of_days = 1, choose_random_state = FALSE)
#' df <- TADA_RandomTestingData(number_of_days = 10, choose_random_state = TRUE)
#' df <- TADA_RandomTestingData(number_of_days = 5, choose_random_state = TRUE, autoclean = FALSE)
#' }
TADA_RandomTestingData <- function(number_of_days = 1, choose_random_state = FALSE,
autoclean = TRUE) {
get_random_data <- function(ndays = number_of_days, state_choice = choose_random_state,
ac = autoclean, ask = FALSE) {
# choose a random day within the last 20 years
twenty_yrs_ago <- Sys.Date() - 20 * 365
random_start_date <- twenty_yrs_ago + sample(20 * 365, 1)
# choose a random start date and add any number_of_days (set that as the end date)
end_date <- random_start_date + ndays
if (state_choice == TRUE) {
load(system.file("extdata", "statecodes_df.Rdata", package = "EPATADA"))
state <- sample(statecodes_df$STUSAB, 1)
}
if (state_choice == FALSE) {
state <- "null"
}
print(c(
startDate = as.character(random_start_date),
endDate = as.character(end_date),
statecode = state
))
if (ac == TRUE) {
dat <- TADA_DataRetrieval(
startDate = as.character(random_start_date),
endDate = as.character(end_date),
statecode = state,
applyautoclean = TRUE,
ask = FALSE
)
}
if (ac == FALSE) {
dat <- TADA_DataRetrieval(
startDate = as.character(random_start_date),
endDate = as.character(end_date),
statecode = state,
applyautoclean = FALSE,
ask = FALSE
)
}
return(dat)
}
verify_random_data <- function() {
df <- get_random_data()
while (nrow(df) < 10) {
df <- get_random_data()
}
return(df)
}
df <- verify_random_data()
return(df)
}
#' Aggregate multiple result values to a min, max, or mean
#'
#' This function groups TADA data by user-defined columns and aggregates the
#' TADA.ResultMeasureValue to a minimum, maximum, or average value.
#'
#' @param .data A TADA dataframe
#' @param grouping_cols The column names used to group the data
#' @param agg_fun The aggregation function used on the grouped data. This can
#' either be 'min', 'max', or 'mean'.
#' @param clean Boolean. Determines whether other measurements from the group
#' aggregation should be removed or kept in the dataframe. If clean = FALSE,
#' additional measurements are indicated in the
#' TADA.ResultValueAggregation.Flag as "Used in aggregation function but not
#' selected".
#'
#' @return A TADA dataframe with aggregated values combined into one row. If the
#' agg_fun is 'min' or 'max', the function will select the row matching the
#' aggregation condition and flag it as the selected measurement. If the
#' agg_fun is 'mean', the function will select a random row from the
#' aggregated rows to represent the metadata associated with the mean value,
#' and gives the row a unique ResultIdentifier: the original ResultIdentifier
#' with the prefix "TADA-". Function adds a TADA.ResultValueAggregation.Flag
#' to indicate which rows have been aggregated.
#'
#' @export
#'
#' @examples
#' # Load example dataset
#' data(Data_6Tribes_5y)
#' # Select maximum value per day, site, comparable data identifier, result detection condition,
#' # and activity type code. Clean all non-maximum measurements from grouped data.
#' Data_6Tribes_5y_agg <- TADA_AggregateMeasurements(Data_6Tribes_5y,
#' grouping_cols = c(
#' "ActivityStartDate", "TADA.MonitoringLocationIdentifier",
#' "TADA.ComparableDataIdentifier", "ResultDetectionConditionText",
#' "ActivityTypeCode"
#' ),
#' agg_fun = "max", clean = TRUE
#' )
#'
#' # Calculate a mean value per day, site, comparable data identifier, result detection condition,
#' # and activity type code. Keep all measurements used to calculate mean measurement.
#' Data_6Tribes_5y_agg <- TADA_AggregateMeasurements(Data_6Tribes_5y,
#' grouping_cols = c(
#' "ActivityStartDate", "TADA.MonitoringLocationIdentifier",
#' "TADA.ComparableDataIdentifier", "ResultDetectionConditionText",
#' "ActivityTypeCode"
#' ),
#' agg_fun = "mean", clean = FALSE
#' )
TADA_AggregateMeasurements <- function(.data, grouping_cols = c("ActivityStartDate", "TADA.MonitoringLocationIdentifier", "TADA.ComparableDataIdentifier", "ResultDetectionConditionText", "ActivityTypeCode"), agg_fun = c("max", "min", "mean"), clean = TRUE) {
TADA_CheckColumns(.data, grouping_cols)
agg_fun <- match.arg(agg_fun)
# Find multiple values in groups
ncount <- .data %>%
dplyr::group_by(dplyr::across(dplyr::all_of(grouping_cols))) %>%
dplyr::summarise(ncount = length(ResultIdentifier))
if (max(ncount$ncount) < 2) {
print("No rows to aggregate.")
return(.data)
} else {
dat <- merge(.data, ncount, all.x = TRUE)
if (any(is.na(dat$TADA.ResultMeasureValue))) {
"Warning: your dataset contains one or more rows where TADA.ResultMeasureValue = NA. Recommend removing these rows before proceeding. Otherwise, the function will not consider NAs in its calculations."
}
dat$TADA.ResultValueAggregation.Flag <- ifelse(dat$ncount == 1, "No aggregation needed", paste0("Used in ", agg_fun, " aggregation function but not selected"))
multiples <- dat %>% dplyr::filter(ncount > 1)
dat <- dat %>% dplyr::select(-ncount)
if (agg_fun == "max") {
out <- multiples %>%
dplyr::group_by(dplyr::across(dplyr::all_of(grouping_cols))) %>%
dplyr::slice_max(order_by = TADA.ResultMeasureValue, n = 1, with_ties = FALSE)
dat$TADA.ResultValueAggregation.Flag <- ifelse(dat$ResultIdentifier %in% out$ResultIdentifier, paste0("Selected as ", agg_fun, " aggregate value"), dat$TADA.ResultValueAggregation.Flag)
}
if (agg_fun == "min") {
out <- multiples %>%
dplyr::group_by(dplyr::across(dplyr::all_of(grouping_cols))) %>%
dplyr::slice_min(order_by = TADA.ResultMeasureValue, n = 1, with_ties = FALSE)
dat$TADA.ResultValueAggregation.Flag <- ifelse(dat$ResultIdentifier %in% out$ResultIdentifier, paste0("Selected as ", agg_fun, " aggregate value"), dat$TADA.ResultValueAggregation.Flag)
}
if (agg_fun == "mean") {
out <- multiples %>%
dplyr::group_by(dplyr::across(dplyr::all_of(grouping_cols))) %>%
dplyr::mutate(TADA.ResultMeasureValue1 = mean(TADA.ResultMeasureValue, na.rm = TRUE)) %>%
dplyr::slice_sample(n = 1) %>%
dplyr::mutate(TADA.ResultValueAggregation.Flag = paste0("Selected as ", agg_fun, " aggregate value, with randomly selected metadata from a row in the aggregate group"))
out <- out %>%
dplyr::select(-TADA.ResultMeasureValue) %>%
dplyr::rename(TADA.ResultMeasureValue = TADA.ResultMeasureValue1) %>%
dplyr::mutate(ResultIdentifier = paste0("TADA-", ResultIdentifier))
dat <- plyr::rbind.fill(dat, out)
}
if (clean == TRUE) {
dat <- subset(dat, !dat$TADA.ResultValueAggregation.Flag %in% c(paste0("Used in ", agg_fun, " aggregation function but not selected")))
}
dat <- TADA_OrderCols(dat)
print("Aggregation results:")
print(table(dat$TADA.ResultValueAggregation.Flag))
return(dat)
}
}
#' Get bounding box JSON
#'
#' @param bbox A bounding box from the sf function st_bbox
#' @return A string containing bounding box JSON that can be passed to an
#' ArcGIS feature layer in the Input Geometry field
#'
#' @examples
#' \dontrun{
#' # Load example dataset
#' data(Data_6Tribes_5y)
#' # Get the bounding box of the data
#' bbox <- sf::st_bbox(
#' c(
#' xmin = min(Data_6Tribes_5y$TADA.LongitudeMeasure),
#' ymin = min(Data_6Tribes_5y$TADA.LatitudeMeasure),
#' xmax = max(Data_6Tribes_5y$TADA.LongitudeMeasure),
#' ymax = max(Data_6Tribes_5y$TADA.LatitudeMeasure)
#' ),
#' crs = sf::st_crs(Data_6Tribes_5y)
#' )
#' # Get a string containing the JSON of the bounding box
#' getBboxJson(bbox)
#' }
getBboxJson <- function(bbox) {
json <- paste0('{"xmin":', bbox[1], ',"ymin":', bbox[2], ',"xmax":', bbox[3], ',"ymax":', bbox[4], "}")
return(json)
}
#' Create icon(s) to be used to represent points on a map feature layer
#' pchIcons is used within TADA_addPoints
#'
#' Uses the different plotting symbols available in R to create PNG files that can be used as markers on a map feature layer.
#'
#' @param pch Plot character code; either a single number or a vector of multiple numbers. Possible values available at https://www.geeksforgeeks.org/r-plot-pch-symbols-different-point-shapes-available-in-r/. Defaults to 1 (an open circle).
#' @param width Width of the plot character. Defaults to 30 pixels.
#' @param height Height of the plot character. Defaults to 30 pixels.
#' @param bg Background color of the plot character Defaults to transparent.
#' @param col Color(s) of the plot character(s). Defaults to black.
#' @param lwd Line width. Optional, defaults to NULL.
#' @return Path(s) to PNG file(s) in a temp folder on user's computer.
#'
#' @examples
#' \dontrun{
#' # Create three PNG files, a red circle, blue triangle, and yellow "X", each on a green background.
#' pchIcons(c(1, 2, 4), 40, 40, "green", c("red", "blue", "yellow"))
#' }
pchIcons <- function(pch = 1,
width = 30,
height = 30,
bg = "transparent",
col = "black",
lwd = NULL) {
n <- length(pch)
files <- character(n)
for (i in seq_len(n)) {
f <- tempfile(fileext = ".png")
grDevices::png(f,
width = width,
height = height,
bg = bg
)
graphics::par(mar = c(0, 0, 0, 0))
graphics::plot.new()
graphics::points(
.5,
.5,
pch = pch[i],
col = col[i],
cex = min(width, height) / 8,
lwd = lwd
)
grDevices::dev.off()
files[i] <- f
}
files
}
#' Retrieve feature layer from ArcGIS REST service
#' getFeatureLayer is used by writeLayer to write feature layers to local files
#'
#' @param url URL of the layer REST service, ending with "/query". Example: https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/2/query (American Indian Reservations)
#' @param bbox A bounding box from the sf function st_bbox; used to filter the query results. Optional; defaults to NULL.
#' @return ArcGIS feature layer
#'
#' @examples
#' \dontrun{
#' # Load example dataset
#' data(Data_Nutrients_UT)
#' # Get the bounding box of the data
#' bbox <- sf::st_bbox(
#' c(
#' xmin = min(Data_Nutrients_UT$TADA.LongitudeMeasure),
#' ymin = min(Data_Nutrients_UT$TADA.LatitudeMeasure),
#' xmax = max(Data_Nutrients_UT$TADA.LongitudeMeasure),
#' ymax = max(Data_Nutrients_UT$TADA.LatitudeMeasure)
#' ),
#' crs = sf::st_crs(Data_Nutrients_UT)
#' )
#' # Get the American Indian Reservations feature layer,
#' # filtered by the bounding box for the Data_Nutrients_UT example dataset
#' getFeatureLayer("https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/2/query", bbox)
#' }
getFeatureLayer <- function(url, bbox = NULL) {
if (is.null(bbox)) {
inputGeom <- NULL
} else {
inputGeom <- getBboxJson(bbox)
}
url <- paste0(url, "?where=1%3D1&outfields=*&returnGeometry=true&geometry=", inputGeom, "&f=geojson")
layer <- sf::read_sf(url)
return(layer)
}
#' Download a shapefile from an API and save it to a local folder, overwriting existing file if it exists
#' writeLayer is used by TADA_UpdateTribalLayers in TADAGeospatialRefLayers.R.
#'
#' @param url URL of the layer REST service, ending with "/query". Example: https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/2/query (American Indian Reservations)
#' @param layerfilepath Local path to save the .shp file to
#'
#' @examples
#' \dontrun{
#' # Get the Oklahoma Tribal Statistical Areas feature layer and write
#' # local file to inst/extdata/OKTribe.shp
#' OKTribeUrl <- "https://geopub.epa.gov/arcgis/rest/services/EMEF/Tribal/MapServer/4/query"
#' writeLayer(OKTribeUrl, "inst/extdata/OKTribe.shp")
#' }
writeLayer <- function(url, layerfilepath) {
layer <- getFeatureLayer(url)
# Attribute names can only be up to 10 characters long when saved to .dbf as part of sf::st_write.
# They are truncated automatically but TOTALAREA_MI and TOTALAREA_KM will not be unique after being
# truncated, so explicitly rename them first if they exist to avoid error.
if ("TOTALAREA_MI" %in% colnames(layer)) {
layer <- layer %>% dplyr::rename(
TAREA_MI = TOTALAREA_MI,
TAREA_KM = TOTALAREA_KM
)
}
sf::st_write(layer, layerfilepath, delete_layer = TRUE)
}
#' Get a shapefile from a local folder, optionally crop it by a bounding box, and return it as a sf object
#' getLayer is used within TADA_addPolys and TADA_addPoints
#'
#' @param layerfilepath Local path to the .shp file for the layer
#' @param bbox A bounding box from the sf function st_bbox; used to filter the query results. Optional; defaults to NULL.
#' @return sf object containing the layer
#'
#'
#' @examples
#' \dontrun{
#' # Load example dataset
#' data(Data_6Tribes_5y_Harmonized)
#' # Get the bounding box of the data
#' bbox <- sf::st_bbox(
#' c(
#' xmin = min(Data_6Tribes_5y_Harmonized$TADA.LongitudeMeasure),
#' ymin = min(Data_6Tribes_5y_Harmonized$TADA.LatitudeMeasure),
#' xmax = max(Data_6Tribes_5y_Harmonized$TADA.LongitudeMeasure),
#' ymax = max(Data_6Tribes_5y_Harmonized$TADA.LatitudeMeasure)
#' ),
#' crs = sf::st_crs(Data_6Tribes_5y_Harmonized)
#' )
#' # Get the American Indian Reservations feature layer,
#' # filtered by the bounding box for the Data_6Tribes_5y_Harmonized
#' # example dataset
#' layerfilepath <- "extdata/AmericanIndian.shp"
#' getLayer(layerfilepath, bbox)
#' }
getLayer <- function(layerfilepath, bbox = NULL) {
layer <- sf::st_read(system.file(layerfilepath, package = "EPATADA"))
if (!(is.null(bbox))) {
sf::sf_use_s2(FALSE)
layer <- sf::st_make_valid(layer)
layer <- sf::st_crop(layer, bbox)
}
return(layer)
}
#' Get text for tribal marker popup
#' getPopup is used within TADA_addPolys and TADA_addPoints
#'
#' @param layer A map feature layer
#' @param layername Name of the layer
#' @return Vector of strings to be used as the text for the popups when clicking on a tribal marker
#'
#' @examples
#' \dontrun{
#' # Get the Oklahoma Tribal Statistical Areas feature layer
#' layer <- getLayer("extdata/OKTribe.shp")
#' # Get popup text for individual markers
#' getPopup(layer, "Oklahoma Tribal Statistical Areas")
#' }
getPopup <- function(layer, layername) {
text <- paste0("<strong>", layername, "</strong><p>")
cols <-
c(
"TRIBE_NAME" = "Tribe Name",
"PARCEL_NO" = "Parcel Number",
"EPA_ID" = "EPA ID",
"TYPE" = "Type"
)
for (i in seq(1, length(cols))) {
if (names(cols[i]) %in% colnames(layer)) {
text <- paste0(text, "<strong>", cols[i], "</strong>: ", layer[[names(cols[i])]], "<br>")
}
}
return(text)
}
#' Add polygons from an ArcGIS feature layer to a leaflet map
#'
#' @param map A leaflet map
#' @param layerfilepath Local path to the .shp file for the layer
#' @param layergroup Name of the layer group
#' @param layername Name of the layer
#' @param bbox A bounding box from the sf function st_bbox; used to filter the query results. Optional; defaults to NULL.
#' @return The original map with polygons from the feature layer added to it.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Create a leaflet map
#' lmap <- leaflet::leaflet() %>%
#' leaflet::addProviderTiles("Esri.WorldTopoMap", group = "World topo") %>%
#' leaflet::addMapPane("featurelayers", zIndex = 300)
#' # Add the American Indian Reservations feature layer to the map
#' lmap <- TADA_addPolys(lmap, "extdata/AmericanIndian.shp", "Tribes", "American Indian Reservations")
#' lmap
#' }
TADA_addPolys <- function(map, layerfilepath, layergroup, layername, bbox = NULL) {
layer <- getLayer(layerfilepath, bbox)
if (is.null(layer)) {
return(map)
}
lbbox <- sf::st_bbox(layer)
if (is.na(lbbox[1])) {
return(map)
}
areaColumn <- "ALAND_KM"
if (!(areaColumn %in% colnames(layer))) {
areaColumn <- "AREA_KM"
}
map <-
leaflet::addPolygons(
map,
data = layer,
color = "#A0522D",
weight = 0.35,
smoothFactor = 0.5,
opacity = 1.0,
fillOpacity = 0.2,
fillColor = ~ leaflet::colorNumeric("Oranges", layer[[areaColumn]])(layer[[areaColumn]]),
highlightOptions = leaflet::highlightOptions(
color = "white",
weight = 2,
bringToFront = TRUE
),
popup = getPopup(layer, layername),
group = layergroup,
options = leaflet::pathOptions(pane = "featurelayers")
)
return(map)
}
#' Add points from an ArcGIS feature layer to a leaflet map
#'
#' @param map A leaflet map
#' @param layerfilepath Local path to the .shp file for the layer
#' @param layergroup Name of the layer group
#' @param layername Name of the layer
#' @param bbox A bounding box from the sf function st_bbox; used to filter the query results. Optional; defaults to NULL.
#' @return The original map with polygon from the feature layer added to it.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Create a leaflet map
#' lmap <- leaflet::leaflet() %>%
#' leaflet::addProviderTiles("Esri.WorldTopoMap", group = "World topo") %>%
#' leaflet::addMapPane("featurelayers", zIndex = 300)
#' # Add the Virginia Federally Recognized Tribes feature layer to the map
#' lmap <- TADA_addPoints(
#' lmap, "extdata/VATribe.shp",
#' "Tribes", "Virginia Federally Recognized Tribes"
#' )
#' lmap
#' }
TADA_addPoints <- function(map, layerfilepath, layergroup, layername, bbox = NULL) {
layer <- getLayer(layerfilepath, bbox)
if (is.null(layer)) {
return(map)
}
lbbox <- sf::st_bbox(layer)
if (is.na(lbbox[1])) {
return(map)
}
shapes <- c(2) # open triangle; for other options see https://www.geeksforgeeks.org/r-plot-pch-symbols-different-point-shapes-available-in-r/
iconFiles <- pchIcons(shapes, width = 20, height = 20, col = c("#CC7722"), lwd = 2)
map <- leaflet::addMarkers(
map,
data = layer,
icon = ~ leaflet::icons(
iconUrl = iconFiles[],
popupAnchorX = 20,
popupAnchorY = 0
),
popup = getPopup(layer, layername),
group = layergroup,
options = leaflet::pathOptions(pane = "featurelayers")
)
return(map)
}
#' Create Characteristic/MeasureUnitCode/MethodSpeciation Ref
#'
#' Creates data frame of unique combinations of TADA.CharacteristicName,
#' TADA.ResultMeasure.MeasureUnitCode, ResultMeasure.MeasureUnitCode, and
#' TADA.MethodSpeciationName in a TADA data frame.
#'
#' @param .data A TADA data frame.
#'
#' @return A data frame with unique combinations of TADA.CharacteristicName,
#' TADA.ResultMeasure.MeasureUnitCode, ResultMeasure.MeasureUnitCode, and
#' TADA.MethodSpeciationName
#'
#' @export
#'
#' @examples
#' UniqueCharUnitSpecExample <-
#' TADA_UniqueCharUnitSpeciation(Data_NCTCShepherdstown_HUC12)
TADA_UniqueCharUnitSpeciation <- function(.data) {
required_cols <- c(
"TADA.CharacteristicName", "TADA.ResultSampleFractionText",
"TADA.MethodSpeciationName", "TADA.ResultMeasure.MeasureUnitCode",
"TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode"
)
# Check to see if TADA_Autoclean has been run
if (any(required_cols %in% colnames(.data)) == FALSE) {
print("The dataframe does not contain the required fields. Running TADA_AutoClean to create required columns.")
.data <- TADA_AutoClean(.data)
}
if (all(required_cols %in% colnames(.data)) == TRUE) {
.data <- .data
}
# Create df of unique codes and characteristic names(from TADA.CharacteristicName and TADA.ResultMeasure.MeasureUnitCode) in TADA data frame
data.units.result <- .data %>%
dplyr::select(
TADA.CharacteristicName, TADA.ResultMeasure.MeasureUnitCode,
ResultMeasure.MeasureUnitCode, TADA.MethodSpeciationName
) %>%
dplyr::filter(!is.na(TADA.ResultMeasure.MeasureUnitCode)) %>%
dplyr::distinct()
# Create df of unique codes and characteristic names(from TADA.CharacteristicName and TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode) in TADA data frame
data.units.det <- .data %>%
dplyr::select(
TADA.CharacteristicName, TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode,
DetectionQuantitationLimitMeasure.MeasureUnitCode, TADA.MethodSpeciationName
) %>%
dplyr::filter(!is.na(TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode)) %>%
dplyr::distinct() %>%
dplyr::rename(
TADA.ResultMeasure.MeasureUnitCode = TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode,
ResultMeasure.MeasureUnitCode = DetectionQuantitationLimitMeasure.MeasureUnitCode
)
# Create combined df with all unique codes (both result and det units) and characteristic names
data.units <- data.units.result %>%
dplyr::full_join(data.units.det, by = c(
"TADA.CharacteristicName", "TADA.ResultMeasure.MeasureUnitCode",
"ResultMeasure.MeasureUnitCode", "TADA.MethodSpeciationName"
)) %>%
dplyr::distinct() %>%
dplyr::group_by(TADA.CharacteristicName) %>%
dplyr::mutate(NCode = length(unique(TADA.ResultMeasure.MeasureUnitCode))) %>%
dplyr::filter(!is.na(TADA.ResultMeasure.MeasureUnitCode) |
is.na(TADA.ResultMeasure.MeasureUnitCode) & NCode == 1) %>%
dplyr::select(-NCode)
return(data.units)
}
#' Create Color Palette For Use in Graphs and Maps
#'
#' Creates a consistent color palette for use in TADA visualizations. Consistent
#' color pairings can be utilized by setting col_pair = TRUE, in which each row
#' consists of two values for color outlines and fills. Currently, the palette
#' is utilizing the "Okabe-Ito" palette from base R via the palette.colors
#' function. The palette includes 9 colors by default. However, additional colors
#' can be added to the palette as needed as more complex visualization functions
#' are added to the TADA package.
#'
#' @param col_pair Boolean argument. Optional argument to define consistent color
#' pairings for outlines/fills of TADA figures defined by the row values in a dataframe.
#'
#' @return A color palette based on the "Okabe-Ito" palette, extended to 15 colors,
#' with modifications for use in mapping and graphing functions
#'
#' @export
#'
#' @examples
#' TestColorPalette <- TADA_ColorPalette()
#' TestColorPalettePairings <- TADA_ColorPalette(col_pair = TRUE)
#' TestColorPalettePairings
TADA_ColorPalette <- function(col_pair = FALSE) {
pal <- c(
"#000000", "#835A00", "#DC851E", "#059FA4", "#56B4E9",
"#005258", "#A1A522", "#F0E442", "#66A281", "#1E6F98",
"#4F5900", "#813B00", "#CD758F", "#B686A1", "#999999"
)
# Defines two color columns to be used as the color pairings in a dataframe
col1 <- c()
col2 <- c()
col_combo <- data.frame()
# Each row defines the pairing of colors to be used if col_pair is TRUE
if (col_pair == TRUE) {
col1 <- c(pal[5], pal[3], pal[7], pal[14])
col2 <- c(pal[10], pal[12], pal[11], pal[2])
col_combo <- data.frame(col1, col2)
pal <- col_combo
}
return(pal)
}
#' View TADA Color Palette
#'
#' View a swatch of the colors in the TADA Color palette labeled by color and
#' index number. TADA developers can reference this function when deciding which
#' colors to use in TADA visualizations. TADA users can also reference this
#' palette function to create their own visually consistent figures. TADA consistent
#' color pairings when col_pair = TRUE can be viewed in a matrix format.
#'
#' @param col_pair Boolean argument. Optional argument to view consistent color
#' pairings for outlines/fills of TADA figures defined by the row values in a dataframe.
#'
#' @return A color swatch figure based on the TADA color palette.
#'
#' @export
#'
#' @examples
#' TestViewPalette <- TADA_ViewColorPalette()
#' TestViewPalettePairing <- TADA_ViewColorPalette(col_pair = TRUE)
TADA_ViewColorPalette <- function(col_pair = FALSE) {
# call TADA color palette
pal <- TADA_ColorPalette()
# determine length of color palette
n <- length(pal)
# create list of label colors, first one needs to be white to show up clearly
label_colors <- rep("black", n)
label_colors[1] <- "white"
# create color swatch graphic
graphics::par(mar = c(1, 0, 1, 0))
swatch <- graphics::plot(1,
type = "n", xlab = "", ylab = "", xlim = c(0.5, n + 0.5), ylim = c(0, 1),
main = "TADA Palette", axes = FALSE
)
rect(1:n - 0.5, 0, n + 0.5, 1, col = pal, border = NA)
text(x = 1:n, y = 0.5, labels = 1:n, pos = 3, col = label_colors)
text(x = 1:n, y = 0.5 - 0.2, labels = pal, pos = 1, col = label_colors, cex = 0.7, srt = 90)
col_combo <- TADA_ColorPalette(col_pair = TRUE)
if (col_pair == TRUE) {
swatch <- list()
# Create a 2 x nrow/2 plotting matrix, can handle additional color pairings, in one view, if more are added in the future.
graphics::par(mfrow = c(2, nrow(col_combo) / 2))
# create list of label colors for pairs
label_colors <- rep("black", 2)
for (i in 1:nrow(col_combo)) {
one_swatch <- graphics::plot(1,
type = "n", xlab = "", ylab = "", xlim = c(0.5, 2.5), ylim = c(0, 1),
main = paste0("TADA Palette Pair ", i), axes = FALSE
)
rect(1:2 - 0.5, 0, 2 + 0.5, 1, col = as.character(col_combo[i, ]), border = NA)
# text(x = 1:2, y = 0.5 - 0.2, labels = 1:2, pos = 3, col = label_colors, cex = 0.75)
text(x = 1:2 + 0.25, y = 0.5, labels = col_combo[i, ], pos = 2, col = label_colors, cex = 0.7)
swatch[[i]] <- one_swatch
}
}
graphics::par(mfrow = c(1, 1))
swatch <- grDevices::recordPlot()
return(swatch)
}
#' Remove NAs in Strings for Figure Titles and Axis Labels
#'
#' Returns a vector of string(s) that removes common NA strings
#' found in columns such as TADA.ComparableDataIdentifier. Can also
#' accommodate handling of certain NA texts found in any general
#' character string or a vector of strings.
#'
#' This function is meant as an internal function to remove NAs
#' from figure titles and axis labels for the TADA package.
#'
#' @param char_string Character argument. Could be a single string
#' or vector of strings that contains common "NA" strings
#' (ex: "(NA", "(NA)", "_NA", etc.)
#'
#' @return A vector string that has removed NAs from its value.
#'
#' @export
#'
#' @examples
#' # Removes NAs based on each TADA.ComparableDataIdentifier found in a dataset.
#' data(Data_Nutrients_UT)
#' UT_Titles <- TADA_CharStringRemoveNA(unique(Data_Nutrients_UT$TADA.ComparableDataIdentifier))
TADA_CharStringRemoveNA <- function(char_string) {
# Checks if data type is a character string.
if (!is.character(char_string)) {
stop(paste0("TADA_CharStrignRemoveNA: 'char_string' argument is not a character string."))
}
# Converts character string to a vector.
title_string <- as.vector(char_string)
# Looks through each item in the vector and removes NAs from each.
labs <- c()
for (i in 1:length(char_string)) {
labs[i] <- paste0(char_string[i], collapse = " ")
labs[i] <- gsub("_NA|\\(NA|\\(NA)", "", labs[i])
labs[i] <- gsub("_", " ", labs[i])
labs <- as.vector(labs)
}
return(labs)
}
#' Create downloadable table
#'
#' This function creates a data table that can be downloaded as a .csv, .xlsx or .pdf.
#'
#' @param .data A data frame
#'
#' @return A data table with multiple download options (.csv, .xlsx or .pdf).
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # return ATTAINS parameter domain values
#' TADA_TableExport(rATTAINS::domain_values(domain_name = "ParameterName"))
#' }
TADA_TableExport <- function(.data = NULL) {
if (is.null(.data)) {
print("No dataframe provided. Please enter a dataframe to return")
}
data <- DT::datatable(.data,
extensions = c("Buttons", "FixedColumns"),
options = list(
paging = TRUE,
dom = "Bfrtip",
autoWidth = TRUE,
pageLength = 5,
scrollX = TRUE,
scrollCollapse = TRUE,
buttons = c("copy", "csv", "excel", "pdf")
# fixedColumns = list(leftColumns = 1
), class = "display"
) %>%
DT::formatStyle(columns = colnames(.data), "fontSize" = "80%")
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.