#' Check for Suspect Analytical Methods
#'
#' Function checks the validity of each characteristic-analytical method
#' combination in the dataframe. When clean = TRUE, rows with Suspect
#' characteristic-analytical method combinations are removed. Default is
#' clean = FALSE. When flaggedonly = TRUE, dataframe is filtered to show only
#' Suspect characteristic-analytical method combinations. Default is
#' flaggedonly = FALSE.
#'
#' The “Not Reviewed” value within "TADA.AnalyticalMethod.Flag" means
#' that the EPA WQX team has not yet reviewed the combinations
#' (see https://cdx.epa.gov/wqx/download/DomainValues/QAQCCharacteristicValidation.CSV).
#' The WQX team plans to review and update these new combinations quarterly.
#'
#' @param .data TADA dataframe
#' @param clean Boolean argument; removes "Suspect" characteristic-analytical
#' method combinations from the dataframe when clean = TRUE. Default is
#' clean = FALSE.
#' @param flaggedonly Boolean argument; filters dataframe to show only "Suspect"
#' characteristic-analytical method combinations when flaggedonly = TRUE. Default
#' is flaggedonly = FALSE.
#'
#' @return This function adds the TADA.AnalyticalMethod.Flag to a TADA dataframe. This column
#' flags Suspect CharacteristicName, ResultAnalyticalMethod/MethodIdentifier,
#' and ResultAnalyticalMethod/MethodIdentifierContext combinations in your dataframe
#' as either "Not Reviewed", "Suspect", or "Pass". When clean = FALSE and
#' flaggedonly = TRUE, the dataframe is filtered to show only "Suspect"
#' characteristic-analytical method combinations; the column TADA.AnalyticalMethod.Flag
#' is still appended. When clean = TRUE and flaggedonly = FALSE, "Suspect" rows
#' are removed from the dataframe and no column will be appended.
#'
#' @export
#'
#' @examples
#' # Load example dataset
#' data(Data_NCTCShepherdstown_HUC12)
#'
#' # Remove Suspect characteristic-analytical method combinations from
#' # dataframe:
#' SuspectMethod_clean <- TADA_FlagMethod(Data_NCTCShepherdstown_HUC12,
#' clean = TRUE
#' )
#'
#' # Flag, but do not remove, Suspect characteristic-analytical method
#' # combinations
#' # in new column titled "TADA.AnalyticalMethod.Flag":
#' SuspectMethod_flags <- TADA_FlagMethod(Data_NCTCShepherdstown_HUC12,
#' clean = FALSE
#' )
#'
#' # Show only Suspect characteristic-analytical method combinations:
#' SuspectMethod_flaggedonly <- TADA_FlagMethod(Data_NCTCShepherdstown_HUC12,
#' clean = FALSE, flaggedonly = TRUE
#' )
#'
TADA_FlagMethod <- function(.data, clean = FALSE, flaggedonly = FALSE) {
# check .data is data.frame
TADA_CheckType(.data, "data.frame", "Input object")
# check clean is boolean
TADA_CheckType(clean, "logical")
# check flaggedonly is boolean
TADA_CheckType(flaggedonly, "logical")
# check .data has required columns
required_cols <- c(
"TADA.CharacteristicName", "ResultAnalyticalMethod.MethodIdentifier",
"ResultAnalyticalMethod.MethodIdentifierContext"
)
TADA_CheckColumns(.data, required_cols)
# check that clean and flaggedonly are not both TRUE
if (clean == TRUE & flaggedonly == TRUE) {
stop("Function not executed because clean and flaggedonly cannot both be TRUE")
}
# execute function after checks are passed - removes flag column in case reference table has changed.
# delete existing flag column
if (("TADA.AnalyticalMethod.Flag" %in% colnames(.data)) == TRUE) {
.data <- dplyr::select(.data, -TADA.AnalyticalMethod.Flag)
}
# read in WQX val reference table and filter
meth.ref <- utils::read.csv(system.file("extdata", "WQXcharValRef.csv", package = "EPATADA")) %>%
dplyr::filter(Type == "CharacteristicMethod")
# join "TADA.WQXVal.Flag" column to .data by CharacteristicName, Source (Media), and Value (unit)
check.data <- merge(.data, meth.ref[, c("Characteristic", "Source", "Value", "TADA.WQXVal.Flag")],
by.x = c(
"TADA.CharacteristicName", "ResultAnalyticalMethod.MethodIdentifier",
"ResultAnalyticalMethod.MethodIdentifierContext"
),
by.y = c("Characteristic", "Value", "Source"), all.x = TRUE
)
# rename TADA.WQXVal.Flag column to WQX.AnalyticalMethodValidity
check.data <- check.data %>%
dplyr::rename(TADA.AnalyticalMethod.Flag = TADA.WQXVal.Flag) %>%
dplyr::distinct()
# rename NA values to NonStandardized in WQX.AnalyticalMethodValidity column
check.data["TADA.AnalyticalMethod.Flag"][is.na(check.data["TADA.AnalyticalMethod.Flag"])] <- "Not Reviewed"
if (flaggedonly == FALSE) {
# if all rows are "Pass" or NA "Not Reviewed", return input unchanged
## note: Cristina edited this on 9/19/22 to keep Not Reviewed/NA data when clean = TRUE. Now only Suspect data is removed.
if (any("Suspect" %in%
unique(check.data$TADA.AnalyticalMethod.Flag)) == FALSE) {
print("No Suspect method/characteristic combinations in your dataframe. Returning the input dataframe with TADA.AnalyticalMethod.Flag column for tracking.")
check.data <- TADA_OrderCols(check.data)
return(check.data)
}
# flagged output, all data
if (clean == FALSE) {
check.data <- TADA_OrderCols(check.data)
return(check.data)
}
# clean output
if (clean == TRUE) {
# filter out Suspect characteristic-unit-method combinations
clean.data <- dplyr::filter(check.data, TADA.AnalyticalMethod.Flag != "Suspect")
# remove WQX.AnalyticalMethodValidity column
# clean.data <- dplyr::select(clean.data, -TADA.AnalyticalMethod.Flag)
clean.data <- TADA_OrderCols(clean.data)
return(clean.data)
}
}
# flagged output, errors only
if (clean == FALSE & flaggedonly == TRUE) {
# filter to show only Suspect characteristic-unit-method combinations
Suspect.data <- dplyr::filter(check.data, TADA.AnalyticalMethod.Flag == "Suspect")
if (nrow(Suspect.data) == 0) {
# Suspect.data <- dplyr::select(Suspect.data, -TADA.AnalyticalMethod.Flag)
print("This dataframe is empty because we did not find any Suspect method/characteristic combinations in your dataframe")
}
Suspect.data <- TADA_OrderCols(Suspect.data)
return(Suspect.data)
}
}
#' Flag Continuous Data
#'
#' Continuous data may (or may not) be suitable for integration with discrete
#' water quality data for analyses. Therefore, this function uses metadata
#' submitted by data providers to flag rows with continuous data.
#'
#' Continuous data is often aggregated to a daily avg, max, and min value,
#' or another statistic of interest to the data submitter. Alternatively, some
#' organizations aggregate their high frequency data (15 min or 1 hour data)
#' to 2 or 4 hour interval averages. In all of these scenarios, the data provider
#' may have also included the raw data (full continuous time series) as a text file
#' attachment at the activity level.
#'
#' @param .data TADA dataframe
#' @param clean Boolean argument: When clean = FALSE (default), a column titled
#' "TADA.ContinuousData.Flag" is added to the dataframe to indicate if
#' each row includes "Continuous" or "Discrete" data. When clean = TRUE, rows
#' with "Continuous" data are removed from the dataframe and no column is appended.
#' @param flaggedonly Boolean argument: When flaggedonly = FALSE (default), all
#' results are included in the output. When flaggedonly = TRUE, the dataframe
#' will be filtered to include only the rows flagged as "Continuous" results.
#' @param time_difference Numeric argument defining the maximum time difference
#' in hours between measurements of the same TADA.ComparableDataIdentifier taken at the same
#' latitude, longitude, and depth. This is used to search for
#' continuous time series data (i.e., if there are multiple measurements within the selected
#' time_difference, then the row will be flagged as continuous). The default time window is 4 hours.
#' The time_difference can be adjusted by the user.
#' @return The default is clean = FALSE and flaggedonly = FALSE.
#' When clean = FALSE and flaggedonly = FALSE (default), a new column,
#' "TADA.ContinuousData.Flag", is appended to the input data set which
#' flags each row as "Continuous" or "Discrete".
#' When clean = FALSE and flaggedonly = TRUE, the dataframe is filtered to show
#' only the flagged continuous data and the flag column is still appended.
#' When clean = TRUE and flaggedonly = FALSE, continuous data is
#' removed from the dataframe and no column is appended.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' all_data <- TADA_DataRetrieval(project = c(
#' "Continuous LC1",
#' "MA_Continuous", "Anchorage Bacteria 20-21"
#' ), ask = FALSE)
#'
#' # Flag continuous data in new column titled "TADA.ContinuousData.Flag"
#' all_data_flags <- TADA_FlagContinuousData(all_data, clean = FALSE)
#'
#' # Show only rows flagged as continuous data (note that all results are
#' # flagged in the example)
#' all_data_flaggedonly <- TADA_FlagContinuousData(all_data,
#' clean = FALSE, flaggedonly = TRUE
#' )
#'
#' # Remove continuous data in dataframe (note that this dataframe will
#' # have 0 results because all are flagged in the example)
#' all_data_clean <- TADA_FlagContinuousData(all_data, clean = TRUE)
#'
#' data(Data_Nutrients_UT)
#'
#' # Flag continuous data in new column titled "TADA.ContinuousData.Flag"
#' Data_Nutrients_UT_flags <- TADA_FlagContinuousData(Data_Nutrients_UT,
#' clean = FALSE
#' )
#' unique(Data_Nutrients_UT_flags$TADA.ContinuousData.Flag)
#'
#' # Show only rows flagged as continuous data
#' Data_Nutrients_UT_flaggedonly <- TADA_FlagContinuousData(Data_Nutrients_UT,
#' clean = FALSE, flaggedonly = TRUE
#' )
#'
#' # Remove continuous data in dataframe
#' Data_Nutrients_UT_clean <- TADA_FlagContinuousData(Data_Nutrients_UT,
#' clean = TRUE
#' )
#' unique(Data_Nutrients_UT_clean$TADA.ContinuousData.Flag)
#' }
#'
TADA_FlagContinuousData <- function(.data, clean = FALSE, flaggedonly = FALSE, time_difference = 4) {
# check .data is data.frame
TADA_CheckType(.data, "data.frame", "Input object")
# check clean is boolean
TADA_CheckType(clean, "logical")
# check flaggedonly is boolean
TADA_CheckType(flaggedonly, "logical")
# check .data has required columns
TADA_CheckColumns(.data, c(
"ActivityTypeCode",
"SampleCollectionEquipmentName",
"ResultDetectionConditionText",
"ResultTimeBasisText",
"StatisticalBaseCode",
"ResultValueTypeName",
"ResultIdentifier",
"OrganizationIdentifier",
"ActivityRelativeDepthName"
))
# check that clean and flaggedonly are not both TRUE
if (clean == TRUE & flaggedonly == TRUE) {
stop("Function not executed because clean and flaggedonly cannot both be TRUE")
}
# run autoclean if it has not already been run
if ("TADA.ActivityMediaName" %in% colnames(.data)) {
.data <- .data
} else {
# run autoclean
.data <- TADA_AutoClean(.data)
}
if ("TADA.LatitudeMeasure" %in% colnames(.data)) {
.data <- .data
} else {
# run autoclean
.data <- TADA_AutoClean(.data)
}
if ("TADA.LongitudeMeasure" %in% colnames(.data)) {
.data <- .data
} else {
# run autoclean
.data <- TADA_AutoClean(.data)
}
# run TADA_FindQCActivities if it has not already been run
if ("TADA.ActivityType.Flag" %in% colnames(.data)) {
.data <- .data
} else {
# run TADA_FindQCActivities
.data <- TADA_FindQCActivities(.data)
}
# run TADA_CreateComparableID if it has not already been run
if ("TADA.ComparableDataIdentifier" %in% colnames(.data)) {
.data <- .data
} else {
# run TADA_CreateComparableID
.data <- TADA_CreateComparableID(.data)
}
# execute function after checks are passed: flag continuous data and make cont.data dataframe
# set default flag to "Discrete"
.data$TADA.ContinuousData.Flag <- "Discrete"
# once new 3.0 profiles come out, check for zip files in ActivityFileURL and flag data that populates the DataLoggerLine
cont.data <- .data %>%
dplyr::filter(TADA.ActivityType.Flag == "Non_QC") %>%
dplyr::filter(ActivityTypeCode == "Field Msr/Obs-Continuous Time Series" | # ID cont data with new activity type code from 2023
grepl("Continuous", ProjectIdentifier) | # ID cont data by looking for string in project ID
grepl("CONTINUOUS", ProjectIdentifier) | # ID cont data by looking for string in project ID
(ActivityTypeCode == "Sample-Integrated Time Series" & SampleCollectionEquipmentName == "Probe/Sensor") |
(ActivityTypeCode == "Field Msr/Obs-Portable Data Logger" & !is.na(ResultTimeBasisText)) |
(ActivityTypeCode == "Field Msr/Obs-Portable Data Logger" & !is.na(StatisticalBaseCode)) |
(ActivityTypeCode == "Field Msr/Obs-Portable Data Logger" & ResultValueTypeName == "Calculated") |
(ActivityTypeCode == "Field Msr/Obs-Portable Data Logger" & ResultValueTypeName == "Estimated") |
(SampleCollectionEquipmentName == "Probe/Sensor" & !is.na(ResultTimeBasisText)) |
(SampleCollectionEquipmentName == "Probe/Sensor" & !is.na(StatisticalBaseCode)) |
(SampleCollectionEquipmentName == "Probe/Sensor" & ResultValueTypeName == "Calculated") |
(SampleCollectionEquipmentName == "Probe/Sensor" & ResultValueTypeName == "Estimated")) %>%
dplyr::mutate(TADA.ContinuousData.Flag = "Continuous")
# everything not YET in cont dataframe
noncont.data <- subset(.data, !.data$ResultIdentifier %in% cont.data$ResultIdentifier)
# if time field is not NA, find time difference between results
if (length(noncont.data) >= 1) {
info_match <- noncont.data %>%
# remove quality control samples
dplyr::filter(TADA.ActivityType.Flag == "Non_QC") %>%
dplyr::group_by(
TADA.LatitudeMeasure, TADA.LongitudeMeasure,
OrganizationIdentifier, TADA.ComparableDataIdentifier,
TADA.ActivityDepthHeightMeasure.MeasureValue,
TADA.ResultDepthHeightMeasure.MeasureValue,
TADA.ActivityBottomDepthHeightMeasure.MeasureValue,
TADA.ActivityTopDepthHeightMeasure.MeasureValue,
ActivityRelativeDepthName
) %>%
dplyr::mutate(n_records = length(TADA.ResultMeasureValue)) %>%
dplyr::mutate(group_id = dplyr::cur_group_id()) %>%
dplyr::filter(n_records > 1) %>%
dplyr::ungroup() %>%
dplyr::group_by(group_id) %>%
dplyr::arrange(ActivityStartDateTime, .by_group = TRUE) %>%
dplyr::mutate(
time_diff_lag = abs(difftime(ActivityStartDateTime, dplyr::lag(ActivityStartDateTime), units = "hours")),
time_diff_lead = abs(difftime(ActivityStartDateTime, dplyr::lead(ActivityStartDateTime), units = "hours"))
) %>%
dplyr::ungroup()
# find results where the time differences is <= time_difference (default is 4 hours)
within_window <- info_match %>%
dplyr::filter(time_diff_lead <= time_difference |
time_diff_lag <= time_difference)
rm(info_match)
# if matches are identified change flag to continuous
noncont.data <- noncont.data %>%
dplyr::mutate(TADA.ContinuousData.Flag = ifelse(ResultIdentifier %in% within_window$ResultIdentifier,
"Continuous", TADA.ContinuousData.Flag
))
rm(within_window)
}
# check if noncont.data is blank. If TRUE, flag.data = cont.data
if (nrow(noncont.data) == 0) {
print("All data is flagged as continuous in TADA.ContinuousData.Flag column.")
flag.data <- cont.data
}
# if noncont.data is NOT blank, flag.data = join of noncont.data with cont.data
if (nrow(noncont.data) != 0) {
flag.data <- cont.data %>%
dplyr::full_join(noncont.data, by = c(names(cont.data)))
}
# flagged output, all data
if (clean == FALSE & flaggedonly == FALSE) {
flag.data <- TADA_OrderCols(flag.data)
return(flag.data)
}
# clean output
if (clean == TRUE & flaggedonly == FALSE) {
# filter out Suspect characteristic-unit-media combinations
clean.data <- flag.data %>%
dplyr::filter(!(TADA.ContinuousData.Flag %in% "Continuous")) %>%
dplyr::select(-TADA.ContinuousData.Flag) %>%
TADA_OrderCols()
return(clean.data)
}
# flagged output, only aggregated continuous data
if (clean == FALSE & flaggedonly == TRUE) {
# filter to show only Suspect characteristic-unit-media combinations
onlycont.data <- flag.data %>%
dplyr::filter(TADA.ContinuousData.Flag == "Continuous") %>%
TADA_OrderCols()
return(onlycont.data)
}
# if no aggregated continuous data is in the data set
if (nrow(flag.data[flag.data$TADA.ContinuousData.Flag == "Continuous", ]) == 0) {
if (flaggedonly == FALSE) {
print("No evidence of aggregated continuous data in your dataframe. Returning the input dataframe with TADA.ContinuousData.Flag column for tracking.")
.data <- TADA_OrderCols(.data)
return(.data)
}
if (flaggedonly == TRUE) {
print("This dataframe is empty because we did not find any aggregated continuous data in your dataframe")
all.cont.data <- flag.data %>%
dplyr::filter(TADA.ContinuousData.Flag == "Continuous")
return(all.cont.data)
}
}
}
#' Check Result Value Against WQX Upper Threshold
#'
#' EPA's Water Quality Exchange (WQX) has generated maximum and minimum thresholds
#' for each parameter and unit combination from millions of water quality data
#' points around the country. This function leverages the WQX QAQC Validation Table
#' to flag any data that is above the upper threshold of result values submitted
#' to WQX for a given characteristic.
#'
#' When clean = FALSE and flaggedonly = FALSE, a column which flags data above
#' the upper WQX threshold is appended to the dataframe. When clean = FALSE and
#' flaggedonly = TRUE, the dataframe is filtered to show only data found above
#' the WQX threshold. When clean = TRUE and flaggedonly = FALSE, rows with values
#' that are above the upper WQX threshold are removed from the dataframe and no
#' column is appended. When clean = TRUE and and flaggedonly = TRUE, the function
#' is not executed and an error message is returned. Defaults are clean = FALSE
#' and flaggedonly = FALSE.
#'
#' This function will add the column "TADA.ResultValueAboveUpperThreshold.Flag" which
#' will be populated with the values: "Pass", "Suspect", "Not Reviewed", or
#' "NA - Not Available". The “Not Reviewed” value means that the EPA WQX team
#' has not yet reviewed the range yet for the characteristic and unit combination combination
#' in that row (see https://cdx.epa.gov/wqx/download/DomainValues/QAQCCharacteristicValidation.CSV).
#' The WQX team plans to review and update these new combinations quarterly. The
#' "NA - Not Available" flag means that the characteristic, media, and/or unit combination
#' for that row is not fully populated (is NA or does not match the WQX data standard)
#' or the result value is NA.
#'
#' If this function is run more than once on the same dataframe,
#' the flag column will be deleted and regenerated.
#'
#' @param .data TADA dataframe
#' @param clean Boolean argument; removes data that is above the upper WQX
#' threshold from the dataframe when clean = TRUE. Default is clean = FALSE.
#' @param flaggedonly Boolean argument; filters dataframe to show only the data
#' flagged as above the upper WQX threshold. Default is flaggedonly = FALSE.
#' @return The input TADA dataset with the added "TADA.ResultValueAboveUpperThreshold.Flag"
#' column which is populated with the values: "Pass", "Suspect", "Not Reviewed", or
#' "NA - Not Available". Defaults are clean = FALSE and flaggedonly = FALSE.
#' When clean = FALSE and flaggedonly = TRUE, the dataframe
#' is filtered to show only data found above the WQX threshold. When clean = TRUE
#' and flaggedonly = FALSE, rows with values that are above the upper WQX threshold
#' are removed from the dataframe. When clean = TRUE and and flaggedonly = TRUE,
#' the function is not executed and an error message is returned.
#'
#' @export
#'
#' @examples
#' # Load example dataset:
#' data(Data_NCTCShepherdstown_HUC12)
#'
#' # Remove data that is above the upper WQX threshold from dataframe:
#' WQXUpperThreshold_clean <- TADA_FlagAboveThreshold(
#' Data_NCTCShepherdstown_HUC12,
#' clean = TRUE
#' )
#'
#' # Flag, but do not remove, data that is above the upper WQX threshold in
#' # new column titled "TADA.ResultValueAboveUpperThreshold.Flag":
#' WQXUpperThreshold_flags <- TADA_FlagAboveThreshold(
#' Data_NCTCShepherdstown_HUC12,
#' clean = FALSE
#' )
#'
#' # Show only data flagged as above the upper WQX threshold:
#' WQXUpperThreshold_flagsonly <- TADA_FlagAboveThreshold(
#' Data_NCTCShepherdstown_HUC12,
#' clean = FALSE, flaggedonly = TRUE
#' )
#'
TADA_FlagAboveThreshold <- function(.data, clean = FALSE, flaggedonly = FALSE) {
# check .data is data.frame
TADA_CheckType(.data, "data.frame", "Input object")
# check clean is boolean
TADA_CheckType(clean, "logical")
# check flaggedonly is boolean
TADA_CheckType(flaggedonly, "logical")
# check .data has required columns
required_cols <- c(
"TADA.CharacteristicName", "TADA.ActivityMediaName", "TADA.ResultMeasureValue",
"TADA.ResultMeasure.MeasureUnitCode"
)
TADA_CheckColumns(.data, required_cols)
# check that clean and flaggedonly are not both TRUE
if (clean == TRUE & flaggedonly == TRUE) {
stop("Function not executed because clean and flaggedonly cannot both be TRUE")
}
# check ResultMeasureValue column is of class numeric
if (!is.numeric(.data$TADA.ResultMeasureValue)) {
stop("The ResultMeasureValue column must be of class 'numeric'.")
}
# execute function after checks are passed
# delete existing flag column - removes flag column in case reference table has changed.
if (("TADA.ResultValueAboveUpperThreshold.Flag" %in% colnames(.data)) == TRUE) {
.data <- dplyr::select(.data, -TADA.ResultValueAboveUpperThreshold.Flag)
}
# get WQXcharVal.ref and filter to include only CharacteristicUnit domain
# Note that status is not applicable to ranges (only unit/char combos)
# Any with "suspect" char/unit combos should also be flagged as suspect
unit.ref <- utils::read.csv(system.file("extdata", "WQXcharValRef.csv", package = "EPATADA")) %>%
dplyr::filter(Type == "CharacteristicUnit")
# update ref table names to prepare for left join with df
names(unit.ref)[names(unit.ref) == "Characteristic"] <- "TADA.CharacteristicName"
names(unit.ref)[names(unit.ref) == "Source"] <- "TADA.ActivityMediaName"
names(unit.ref)[names(unit.ref) == "Value.Unit"] <- "TADA.ResultMeasure.MeasureUnitCode"
# remove extraneous columns from unit.ref
unit.ref <- dplyr::select(unit.ref, c(
-Domain, -Status,
-Type, -Unique.Identifier,
-Note.Recommendation,
-Conversion.Factor,
-Conversion.Coefficient,
-Last.Change.Date,
-Value,
-Minimum
))
# remove duplicates
unit.ref <- unique(unit.ref)
# separate out valid ranges from invalid units
unit.ref <- unit.ref %>%
dplyr::filter(TADA.WQXVal.Flag == "Pass")
check.data <- dplyr::left_join(.data,
unit.ref,
by = c(
"TADA.CharacteristicName",
"TADA.ActivityMediaName",
"TADA.ResultMeasure.MeasureUnitCode"
),
multiple = "all",
relationship = "many-to-one"
)
# Create flag column, flag rows where ResultMeasureValue > Maximum
flag.data <- check.data %>%
# create flag column
dplyr::mutate(TADA.ResultValueAboveUpperThreshold.Flag = dplyr::case_when(
TADA.ResultMeasureValue > Maximum ~ as.character("Suspect"),
TADA.ResultMeasureValue <= Maximum ~ as.character("Pass"),
is.na(Maximum) ~ as.character("Not Reviewed"), # in QAQC table, but not yet reviewed
TRUE ~ as.character("NA - Not Available") # this occurs when the char/unit/media combo is not in the WQX QAQC table at all. USGS data may not be in QAQC table because it does not adhere to the WQX domain tables.
))
# # Add different flags for unit issues (ranges also suspect)
# flag.data <- flag.data %>%
# dplyr::mutate(TADA.ResultValueAboveUpperThreshold.Flag = dplyr::case_when(
# TADA.WQXVal.Flag == "Suspect" ~ as.character("Suspect: Invalid Unit"), # suspect char/unit combos are also suspect ranges
# TADA.WQXVal.Flag == "NonStandardized" ~ as.character("Suspect: NonStandardized Unit")
# ))
# remove Maximum and TADA.WQXVal.Flag column from unit.ref
flag.data <- flag.data %>%
dplyr::select(c(
-"Maximum",
-"TADA.WQXVal.Flag"
))
# if no data above WQX threshold is found
if (any("Suspect" %in%
unique(flag.data$TADA.ResultValueAboveUpperThreshold.Flag)) == FALSE) {
if (flaggedonly == FALSE) {
print("No data above the WQX Upper Threshold was found in your dataframe. Returning the input dataframe with TADA.ResultValueAboveUpperThreshold.Flag column for tracking.")
flag.data <- TADA_OrderCols(flag.data)
return(flag.data)
}
if (flaggedonly == TRUE) {
print("This dataframe is empty because no data above the WQX Upper Threshold was found in your dataframe")
emptyflag.data <- dplyr::filter(flag.data, TADA.ResultValueAboveUpperThreshold.Flag %in% "Suspect")
emptyflag.data <- TADA_OrderCols(emptyflag.data)
return(emptyflag.data)
}
}
# flagged, all data
if (clean == FALSE & flaggedonly == FALSE) {
flag.data <- TADA_OrderCols(flag.data)
return(flag.data)
}
# clean data
if (clean == TRUE & flaggedonly == FALSE) {
# filter out rows where TADA.ResultValueAboveUpperThreshold.Flag = Suspect; remove TADA.ResultValueAboveUpperThreshold.Flag column
clean.data <- flag.data %>%
dplyr::filter(!(TADA.ResultValueAboveUpperThreshold.Flag %in% "Suspect")) # %>%
# dplyr::select(-TADA.ResultValueAboveUpperThreshold.Flag)
clean.data <- TADA_OrderCols(clean.data)
return(clean.data)
}
# flagged, errors only
if (clean == FALSE & flaggedonly == TRUE) {
# filter to show only rows above WQX upper threshold
flagsonly.data <- flag.data %>%
dplyr::filter(TADA.ResultValueAboveUpperThreshold.Flag %in% "Suspect")
flagsonly.data <- TADA_OrderCols(flagsonly.data)
return(flagsonly.data)
}
}
#' Check Result Value Against WQX Lower Threshold
#'
#' EPA's Water Quality Exchange (WQX) has generated maximum and minimum thresholds
#' for each parameter and unit combination from millions of water quality data
#' points around the country. This function leverages the WQX QAQC Validation Table
#' to flag any data that is below the lower threshold of result values submitted
#' to WQX for a given characteristic.
#'
#' When clean = FALSE and flaggedonly = FALSE, a column which flags data below
#' the lower WQX threshold is appended to the dataframe. When clean = FALSE and
#' flaggedonly = TRUE, the dataframe is filtered to show only data found below
#' the WQX threshold. When clean = TRUE and flaggedonly = FALSE, rows with values
#' that are below the upper WQX threshold are removed from the dataframe and no
#' column is appended. When clean = TRUE and and flaggedonly = TRUE, the function
#' is not executed and an error message is returned. Defaults are clean = FALSE
#' and flaggedonly = FALSE.
#'
#' This function will add the column "TADA.ResultValueBelowLowerThreshold.Flag" which
#' will be populated with the values: "Pass", "Suspect", "Not Reviewed", or
#' "NA - Not Available". The “Not Reviewed” value means that the EPA WQX team
#' has not yet reviewed the range yet for the characteristic and unit combination combination
#' in that row (see https://cdx.epa.gov/wqx/download/DomainValues/QAQCCharacteristicValidation.CSV).
#' The WQX team plans to review and update these new combinations quarterly. The
#' "NA - Not Available" flag means that the characteristic, media, and/or unit combination
#' for that row is not fully populated (is NA or does not match the WQX data standard)
#' or the result value is NA.
#'
#' If this function is run more than once on the same dataframe,
#' the flag column will be deleted and regenerated.
#'
#' @param .data TADA dataframe
#' @param clean Boolean argument; removes data that is below the lower WQX
#' threshold from the dataframe when clean = TRUE. Default is clean = FALSE.
#' @param flaggedonly Boolean argument; filters dataframe to show only the data
#' flagged as below the lower WQX threshold. Default is flaggedonly = FALSE.
#' @return The input TADA dataset with the added "TADA.ResultValueBelowLowerThreshold.Flag"
#' column which is populated with the values: "Pass", "Suspect", "Not Reviewed", or
#' "NA - Not Available". Defaults are clean = FALSE and flaggedonly = FALSE.
#' When clean = FALSE and flaggedonly = TRUE, the dataframe
#' is filtered to show only data found below the WQX threshold. When clean = TRUE
#' and flaggedonly = FALSE, rows with values that are below the lower WQX threshold
#' are removed from the dataframe. When clean = TRUE and and flaggedonly = TRUE,
#' the function is not executed and an error message is returned.
#'
#' @export
#'
#' @examples
#' # Load example dataset:
#' data(Data_NCTCShepherdstown_HUC12)
#'
#' # Remove data that is below the lower WQX threshold from the dataframe:
#' WQXLowerThreshold_clean <- TADA_FlagBelowThreshold(
#' Data_NCTCShepherdstown_HUC12,
#' clean = TRUE
#' )
#'
#' # Flag, but do not remove, data that is below the lower WQX threshold in
#' # new column titled "TADA.ResultValueBelowLowerThreshold.Flag":
#' WQXLowerThreshold_flags <- TADA_FlagBelowThreshold(
#' Data_NCTCShepherdstown_HUC12,
#' clean = FALSE
#' )
#'
#' # Show only data that is below the lower WQX threshold:
#' WQXLowerThreshold_flagsonly <- TADA_FlagBelowThreshold(
#' Data_NCTCShepherdstown_HUC12,
#' clean = FALSE, flaggedonly = TRUE
#' )
#'
TADA_FlagBelowThreshold <- function(.data, clean = FALSE, flaggedonly = FALSE) {
# check .data is data.frame
TADA_CheckType(.data, "data.frame", "Input object")
# check clean is boolean
TADA_CheckType(clean, "logical")
# check flaggedonly is boolean
TADA_CheckType(flaggedonly, "logical")
# check .data has required columns
required_cols <- c(
"TADA.CharacteristicName", "TADA.ActivityMediaName", "TADA.ResultMeasureValue",
"TADA.ResultMeasure.MeasureUnitCode"
)
TADA_CheckColumns(.data, required_cols)
# check that clean and flaggedonly are not both TRUE
if (clean == TRUE & flaggedonly == TRUE) {
stop("Function not executed because clean and flaggedonly cannot both be TRUE")
}
# check ResultMeasureValue column is of class numeric
if (!is.numeric(.data$TADA.ResultMeasureValue)) {
stop("The ResultMeasureValue column must be of class 'numeric'.")
}
# execute function after checks are passed - removes flag column in case reference table has changed.
# delete existing flag column
if (("TADA.ResultValueBelowLowerThreshold.Flag" %in% colnames(.data)) == TRUE) {
.data <- dplyr::select(.data, -TADA.ResultValueBelowLowerThreshold.Flag)
}
# get WQXcharVal.ref and filter to include only CharacteristicUnit
# Note that status is not applicable to ranges.
# Instead, we generate a validation flag later in this function
unit.ref <- utils::read.csv(system.file("extdata", "WQXcharValRef.csv", package = "EPATADA")) %>%
dplyr::filter(
Type == "CharacteristicUnit"
)
# update ref table names to prepare for left join with df
names(unit.ref)[names(unit.ref) == "Characteristic"] <- "TADA.CharacteristicName"
names(unit.ref)[names(unit.ref) == "Source"] <- "TADA.ActivityMediaName"
names(unit.ref)[names(unit.ref) == "Value.Unit"] <- "TADA.ResultMeasure.MeasureUnitCode"
# remove extraneous columns from unit.ref
unit.ref <- dplyr::select(unit.ref, c(
-Type, -Unique.Identifier, -Domain,
-Status,
-Note.Recommendation,
-Conversion.Factor,
-Conversion.Coefficient,
-Last.Change.Date,
-Value,
-Maximum
))
unit.ref <- unique(unit.ref)
# separate out valid ranges from invalid units
unit.ref <- unit.ref %>%
dplyr::filter(TADA.WQXVal.Flag == "Pass")
check.data <- dplyr::left_join(.data,
unit.ref,
by = c(
"TADA.CharacteristicName",
"TADA.ActivityMediaName",
"TADA.ResultMeasure.MeasureUnitCode"
),
multiple = "all",
relationship = "many-to-one"
)
# Create flag column, flag rows where TADA.ResultMeasureValue < Minimum
flag.data <- check.data %>%
# create flag column
dplyr::mutate(TADA.ResultValueBelowLowerThreshold.Flag = dplyr::case_when(
TADA.ResultMeasureValue < Minimum ~ as.character("Suspect"),
TADA.ResultMeasureValue >= Minimum ~ as.character("Pass"),
is.na(Minimum) ~ as.character("Not Reviewed"), # in QAQC table but not reviewed
TRUE ~ as.character("NA - Not Available") # this occurs when the char/unit/media combo is not in the WQX QAQC table at all. USGS data may not be in QAQC table because it does not adhere to the WQX domain tables.
))
# # Add different flags for unit issues (ranges also suspect)
# flag.data <- flag.data %>%
# dplyr::mutate(TADA.ResultValueAboveBelowThreshold.Flag = dplyr::case_when(
# TADA.WQXVal.Flag == "Suspect" ~ as.character("Suspect: Invalid Unit"), # suspect char/unit combos are also suspect ranges
# TADA.WQXVal.Flag == "NonStandardized" ~ as.character("Suspect: NonStandardized Unit")
# ))
# remove Min column
flag.data <- flag.data %>%
dplyr::select(c(
-"Minimum",
-"TADA.WQXVal.Flag"
))
# if no data below WQX lower threshold is found
if (any("Suspect" %in%
unique(flag.data$TADA.ResultValueBelowLowerThreshold.Flag)) == FALSE) {
if (flaggedonly == FALSE) {
print("No data below the WQX Lower Threshold were found in your dataframe. Returning the input dataframe with TADA.ResultValueBelowLowerThreshold.Flag column for tracking.")
flag.data <- TADA_OrderCols(flag.data)
return(flag.data)
}
if (flaggedonly == TRUE) {
print("This dataframe is empty because no data below the WQX Lower Threshold was found in your dataframe")
emptyflag.data <- dplyr::filter(flag.data, TADA.ResultValueBelowLowerThreshold.Flag %in% "Suspect")
emptyflag.data <- TADA_OrderCols(emptyflag.data)
return(emptyflag.data)
}
}
# flagged, all data
if (clean == FALSE & flaggedonly == FALSE) {
flag.data <- TADA_OrderCols(flag.data)
return(flag.data)
}
# clean data
if (clean == TRUE & flaggedonly == FALSE) {
# filter out rows where TADA.ResultValueBelowLowerThreshold.Flag = Suspect; remove TADA.ResultValueBelowLowerThreshold.Flag column
clean.data <- flag.data %>%
dplyr::filter(!(TADA.ResultValueBelowLowerThreshold.Flag %in% "Suspect"))
clean.data <- TADA_OrderCols(clean.data)
return(clean.data)
}
# only flagged data
if (clean == FALSE & flaggedonly == TRUE) {
# filter to show only rows where TADA.ResultValueBelowLowerThreshold.Flag = Suspect
flagsonly.data <- flag.data %>%
dplyr::filter(TADA.ResultValueBelowLowerThreshold.Flag %in% "Suspect")
flagsonly.data <- TADA_OrderCols(flagsonly.data)
return(flagsonly.data)
}
}
#' Check data for an approved QAPP
#'
#' Function checks data submitted under the column "QAPPApprovedIndicator".
#' Some organizations submit data for this field to indicate if the data
#' produced has an approved Quality Assurance Project Plan (QAPP) or not.
#' Y indicates yes, N indicates no. This function has three default inputs:
#' clean = TRUE, cleanNA = FALSE, and flaggedonly == FALSE. The default flags
#' rows of data where the QAPPApprovedIndicator equals "N". Users could
#' remove NA's in addition to N's using the inputs clean = TRUE, cleanNA = TRUE,
#' and flaggedonly = FALSE. If flaggedonly = TRUE, the function will filter out all
#' rows where the QAPPApprovedIndicator is 'Y'. If clean = FALSE, cleanNA = FALSE,
#' and flaggedonly = FALSE, the function will not make any changes to the data.
#'
#' Note: This is not a required field, so it is often left blank (NA) even if
#' the data has an associated QAPP. All states and tribes that collect
#' monitoring data using 106 funding (almost all state and tribal data in WQX)
#' are required to have an EPA approved QAPP to receive 106 funding. Therefore,
#' most of these organizations data has an approved QAPP even if the data
#' submitted to WQP is NA.
#'
#' @param .data TADA dataframe
#' @param clean Boolean argument with two possible values called "TRUE" and
#' "FALSE". When clean=TRUE, rows of data where the QAPPApprovedIndicator equals
#' "N" will be removed. When, clean=FALSE, rows of data where the
#' QAPPApprovedIndicator equals "N" will be retained.
#' @param cleanNA Boolean argument with two possible values called "TRUE" and
#' "FALSE". When cleanNA=TRUE, rows of data where the QAPPApprovedIndicator
#' equals "NA" will be removed. When, cleanNA=FALSE, rows of data where the
#' the QAPPApprovedIndicator equals "NA" will be retained.
#' @param flaggedonly Boolean argument; when flaggedonly = TRUE, the dataframe will
#' be filtered to remove any rows where the QAPPApprovedIndicator equals "Y".
#'
#' @return Several combinations of inputs are possible:
#' When clean = TRUE, cleanNA = FALSE, and flaggedonly = FALSE, the dataframe will
#' be filtered to show only rows where QAPPAprrovedIndicator is "Y" or "NA";
#' When clean = TRUE, cleanNA = TRUE, and flaggedonly = FALSE, the dataframe will
#' be filtered to show only rows where QAPPApprovedIndicator is "Y";
#' When clean = FALSE, cleanNA = TRUE, and flaggedonly = FALSE, the dataframe will
#' be filtered to show only rows where QAPPApprovedIndicator is "Y" or "N";
#' When clean = FALSE, cleanNA = FALSE, and flaggedonly = FALSE, no rows are
#' removed from the dataframe;
#' When clean = TRUE, cleanNA = TRUE, and flaggedonly = TRUE, the function will
#' not execute and an error message will be returned;
#' When clean = TRUE, cleanNA = FALSE, and flaggedonly = TRUE, the dataframe will
#' be filtered to show only rows where QAPPApprovedIndicator is "NA";
#' When clean = FALSE, cleanNA = TRUE, and flaggedonly = TRUE, the dataframe will
#' be filtered to show only rows where QAPPApprovedIndicator is "N";
#' When clean = FALSE, cleanNA = FALSE, and flaggedonly = TRUE, the dataframe will
#' be filtered to show only rows where QAPPApprovedIndicator is "N" or "NA"
#'
#'
#' @export
#'
#' @examples
#' # Load example dataset:
#' data(Data_Nutrients_UT)
#'
#' # Show data where the QAPPApprovedIndicator equals "Y" or "NA":
#' QAPPapproved_clean <- TADA_FindQAPPApproval(Data_Nutrients_UT)
#'
#' # Show only data where the QAPPApprovedIndicator equals "Y":
#' QAPPapproved_cleanNAs <- TADA_FindQAPPApproval(Data_Nutrients_UT, cleanNA = TRUE)
#'
#' # Show data where the QAPPApprovedIndicator equals "N" or "NA":
#' QAPPIndicator_N_NA <- TADA_FindQAPPApproval(Data_Nutrients_UT,
#' clean = FALSE,
#' cleanNA = FALSE, flaggedonly = TRUE
#' )
#'
#' # Show data where the QAPPApprovedIndicator equals "N":
#' QAPPIndicator_N <- TADA_FindQAPPApproval(Data_Nutrients_UT,
#' clean = FALSE,
#' cleanNA = TRUE, flaggedonly = TRUE
#' )
#'
#' # Note: When clean = FALSE, cleanNA = FALSE, and flaggedonly = FALSE, no data is removed
#' # Note: When clean = TRUE, cleanNA = TRUE, and flaggedonly = TRUE, an error message is returned
#'
TADA_FindQAPPApproval <- function(.data, clean = FALSE, cleanNA = FALSE, flaggedonly = FALSE) {
# check .data is data.frame
TADA_CheckType(.data, "data.frame", "Input object")
# check clean is boolean
TADA_CheckType(clean, "logical")
# check cleanNA is boolean
TADA_CheckType(cleanNA, "logical")
# check flaggedonly is boolean
TADA_CheckType(flaggedonly, "logical")
# check .data has required columns
TADA_CheckColumns(.data, "QAPPApprovedIndicator")
# check that clean, cleanNA and flaggedonly are not all TRUE
if (clean == TRUE & cleanNA == TRUE & flaggedonly == TRUE) {
stop("Function not executed because clean, cleanNA, and flaggedonly cannot all be TRUE")
}
# execute function after checks are passed
# if flaggedonly = FALSE
if (flaggedonly == FALSE) {
if (clean == TRUE) {
.data <- dplyr::filter(.data, is.na(QAPPApprovedIndicator) == TRUE | QAPPApprovedIndicator == "Y")
if (nrow(.data) == 0) {
print("All QAPPApprovedIndicator data is N")
}
}
if (cleanNA == TRUE) {
.data <- dplyr::filter(.data, is.na(QAPPApprovedIndicator) == FALSE)
if (nrow(.data) == 0 & clean == TRUE) {
print("All QAPPApprovedIndicator data is NA or N")
} else if (nrow(.data) == 0 & clean == FALSE) {
print("All QAPPApprovedIndicator data is NA")
}
}
if (clean == FALSE & cleanNA == FALSE) {
print("Data is flagged but not removed because clean and cleanNA were FALSE")
}
.data <- TADA_OrderCols(.data)
return(.data)
}
# if flaggedonly = TRUE
if (flaggedonly == TRUE & clean == TRUE & cleanNA == FALSE) {
NA.data <- dplyr::filter(.data, is.na(QAPPApprovedIndicator) == TRUE)
if (nrow(NA.data) == 0) {
warning("All QAPPApprovedIndicator data is 'Y' or 'N'")
}
NA.data <- TADA_OrderCols(NA.data)
return(NA.data)
}
if (flaggedonly == TRUE & clean == FALSE & cleanNA == TRUE) {
N.data <- dplyr::filter(.data, QAPPApprovedIndicator == "N")
if (nrow(N.data) == 0) {
warning("All QAPPApprovedIndicator data is NA or 'Y'")
}
N.data <- TADA_OrderCols(N.data)
return(N.data)
}
if (flaggedonly == TRUE & clean == FALSE & cleanNA == FALSE) {
NAorN.data <- dplyr::filter(.data, is.na(QAPPApprovedIndicator) == TRUE | QAPPApprovedIndicator == "N")
if (nrow(NAorN.data) == 0) {
warning("All QAPPApprovedIndicator data is 'Y'")
}
NAorN.data <- TADA_OrderCols(NAorN.data)
return(NAorN.data)
}
}
#' Check if an approved QAPP document URL is provided
#'
#' Function checks data submitted under the "ProjectFileUrl" column
#' to determine if a QAPP document is available to review. When clean = FALSE,
#' a column will be appended to flag results that have an associated
#' QAPP document URL provided. When clean = TRUE, rows that do not
#' have an associated QAPP document are removed from the dataframe and no column
#' will be appended. This function should only be used to remove data if an
#' accompanying QAPP document is required to use data in assessments.
#'
#' @param .data TADA dataframe
#' @param clean Boolean argument; removes data without an associated QAPP
#' document from the dataframe when clean = TRUE. Default is clean = FALSE.
#'
#' @return Returns input dataframe with the added "TADA.QAPPDocAvailable" column.
#' When clean = FALSE, no data are removed and the TADA.QAPPDocAvailable column
#' flags rows with an associated QAPP document. When clean = TRUE,
#' data without an associated QAPP document are removed from the dataframe.
#'
#' @export
#'
#' @examples
#' # Load example dataset:
#' data(Data_Nutrients_UT)
#'
#' # Flag, but do not remove, data without an associated QAPP document in
#' # new column titled "TADA.QAPPDocAvailable":
#' FlagData_MissingQAPPDocURLs <- TADA_FindQAPPDoc(Data_Nutrients_UT)
#'
#' # Remove data without an associated QAPP document available:
#' RemoveData_MissingQAPPDocURLs <- TADA_FindQAPPDoc(Data_Nutrients_UT, clean = TRUE)
#'
TADA_FindQAPPDoc <- function(.data, clean = FALSE) {
# check .data is data.frame
TADA_CheckType(.data, "data.frame", "Input object")
# check clean is boolean
TADA_CheckType(clean, "logical")
# check .data has required columns
# generate required column if it does not exist (there is no project data)
if ("ProjectFileUrl" %in% colnames(.data)) {
.data <- .data
} else {
# create empty ProjectFileUrl column
.data[, "ProjectFileUrl"] <- NA
}
# check .data has required columns
TADA_CheckColumns(.data, "ProjectFileUrl")
# default flag column
.data$TADA.QAPPDocAvailable <- "N"
# execute function after checks are passed
# flag data where QAPP document url is provided
# make QAPPdoc.data dataframe
QAPPdoc.data <- dplyr::filter(.data, grepl("/", ProjectFileUrl))
NQAPPdoc.data <- subset(.data, !.data$ResultIdentifier %in% QAPPdoc.data$ResultIdentifier)
# if there is data without an associated QAPP url in the data set
if (nrow(QAPPdoc.data) != 0) {
# change flag column
QAPPdoc.data$TADA.QAPPDocAvailable <- "Y_ProjectFileUrlProvided"
# join QAPPdoc.data to flag.data
flag.data <- plyr::rbind.fill(QAPPdoc.data, NQAPPdoc.data)
# flagged output
if (clean == FALSE) {
flag.data <- TADA_OrderCols(flag.data)
return(flag.data)
}
# clean output
if (clean == TRUE) {
# remove data without an associated QAPP url
clean.data <- dplyr::filter(flag.data, grepl("/", ProjectFileUrl))
# remove TADA.QAPPDocAvailable column
# clean.data <- dplyr::select(clean.data, -TADA.QAPPDocAvailable)
clean.data <- TADA_OrderCols(clean.data)
return(clean.data)
}
}
# if no associated QAPP url data is in the data set
if (nrow(QAPPdoc.data) == 0) {
if (clean == FALSE) {
print("No QAPP document url data found in your dataframe. Returning input dataframe with TADA.QAPPDocAvailable column for tracking.")
.data <- TADA_OrderCols(.data)
return(.data)
}
if (clean == TRUE) {
print("This dataframe is empty because we did not find any QAPP document url data in your dataframe")
QAPPdoc.data <- TADA_OrderCols(QAPPdoc.data)
return(QAPPdoc.data)
}
}
}
#' Suspect coordinates
#'
#' This function identifies and flags Suspect coordinate data. When
#' clean_outsideUSA = "no" and clean_imprecise = FALSE,
#' a column will be appended titled "TADA.SuspectCoordinates.Flag" with the following
#' flags: 1) If the latitude is less than zero, the row will be
#' flagged with "LAT_OutsideUSA" (with the exception of American Samoa,
#' Northern Mariana Islands, and Guam), 2) If the longitude is greater than zero AND less than 145,
#' the row will be flagged as "LONG_OutsideUSA" (with the exception of
#' American Samoa, Northern Mariana Islands, and Guam), and 3) Finally,
#' precision can be measured by the number of decimal places in the latitude and longitude
#' provided. If either the latitude or longitude does not have at least three numbers to the
#' right of the decimal point, the row will be flagged as "Imprecise_lessthan3decimaldigits". Occasionally
#' latitude and longitude measurements are flagged as outside of the United States
#' because the data was entered as negative when it should be positive or vice versa.
#' This function offers the option of clean_outsideUSA = "change sign" to fix this
#' issue. However, data owners should fix the raw data through WQX. For assistance
#' with changing raw data, email the WQX help desk: \email{WQX@@epa.gov}
#'
#' @param .data TADA dataframe
#' @param clean_outsideUSA Character argument with options "no", "remove", and "change sign";
#' flags coordinates as outside the USA when clean_outsideUSA = "no";
#' removes data with coordinates outside of the United States when clean_outsideUSA = "remove";
#' changes sign of lat/long coordinates flagged as outside the USA when
#' clean_outside = "change sign"; Default is clean_outsideUSA = "no".
#' @param clean_imprecise Boolean argument; removes imprecise data when
#' clean_imprecise = TRUE. Default is clean_imprecise = FALSE.
#' @param flaggedonly Boolean argument; Return only flagged data when flaggedonly = TRUE;
#' default is flaggedonly = FALSE.
#'
#' @return Returns input TADA dataset with the added "TADA.SuspectCoordinates.Flag" column.
#' When clean_outsideUSA is "no", "change sign", or clean_imprecise argument is FALSE,
#' a column flagging rows with the respective QA check is appended to the input
#' dataframe. When clean_outsideUSA is "remove" or clean_imprecise is TRUE,
#' "Suspect" or "imprecise" data is removed, respectively. When flaggedonly is TRUE,
#' the dataframe will be filtered to show only the data flagged as Suspect, imprecise,
#' or out of the United States. Defaults are clean_outsideUSA = "no",
#' clean_imprecise = FALSE, and flaggedonly = FALSE.
#'
#' @export
#'
#' @examples
#' # Load example dataset:
#' data(Data_Nutrients_UT)
#'
#' # Flag, but do not remove, data with Suspect coordinates in new column
#' # titled "TADA.SuspectCoordinates.Flag":
#' # Return ALL data:
#' SuspectCoord_flags <- TADA_FlagCoordinates(Data_Nutrients_UT)
#'
#' # Flag, but do not remove, data with Suspect coordinates in new column
#' # titled "TADA.SuspectCoordinates.Flag"
#' # Return ONLY the flagged data:
#' SuspectCoord_flags_flaggedonly <- TADA_FlagCoordinates(Data_Nutrients_UT,
#' flaggedonly = TRUE
#' )
#'
#' # Remove data with coordinates outside the USA, but keep flagged data with
#' # imprecise coordinates:
#' OutsideUSACoord_removed <- TADA_FlagCoordinates(Data_Nutrients_UT,
#' clean_outsideUSA = "remove"
#' )
#'
#' # Change the sign of coordinates flagged as outside the USA and keep all
#' # flagged data:
#' OutsideUSACoord_changed <- TADA_FlagCoordinates(Data_Nutrients_UT,
#' clean_outsideUSA = "change sign"
#' )
#'
#' # Remove data with imprecise coordinates, but keep flagged data with
#' # coordinates outside the USA;
#' # imprecise data may have less than 3 significant figures to the right
#' # of the decimal point:
#' ImpreciseCoord_removed <- TADA_FlagCoordinates(Data_Nutrients_UT,
#' clean_imprecise = TRUE
#' )
#'
#' # Remove data with imprecise coordinates or coordinates outside the USA
#' # from the dataframe:
#' SuspectCoord_removed <- TADA_FlagCoordinates(Data_Nutrients_UT,
#' clean_outsideUSA = "remove", clean_imprecise = TRUE
#' )
#'
TADA_FlagCoordinates <- function(.data,
clean_outsideUSA = c("no", "remove", "change sign"),
clean_imprecise = FALSE,
flaggedonly = FALSE) {
# check .data is data.frame
TADA_CheckType(.data, "data.frame", "Input object")
# check clean_outsideUSA is character
TADA_CheckType(clean_outsideUSA, "character")
# check clean_imprecise is boolean
TADA_CheckType(clean_imprecise, "logical")
# check .data has required columns
TADA_CheckColumns(.data, c("TADA.LatitudeMeasure", "TADA.LongitudeMeasure"))
# check lat and long are "numeric"
if (!is.numeric(.data$TADA.LongitudeMeasure)) {
warning("TADA.LongitudeMeasure field must be numeric")
}
if (!is.numeric(.data$TADA.LatitudeMeasure)) {
warning("TADA.LatitudeMeasure field must be numeric")
}
# check that clean_outsideUSA is either "no", "remove", or "change sign"
clean_outsideUSA <- match.arg(clean_outsideUSA)
orig_dim <- dim(.data)[1]
# execute function after checks are passed
.data <- .data %>%
dplyr::mutate(TADA.SuspectCoordinates.Flag = dplyr::case_when(
TADA.LatitudeMeasure < -11.046934 & TADA.LatitudeMeasure > -14.548699 & TADA.LongitudeMeasure < -168.1433 & TADA.LongitudeMeasure > -171.089874 ~ NA_character_, # American Samoa
TADA.LatitudeMeasure < 20.553802 & TADA.LatitudeMeasure > 14.110472 & TADA.LongitudeMeasure < 146.064818 & TADA.LongitudeMeasure > 144.886331 ~ NA_character_, # Northern Mariana Islands
TADA.LatitudeMeasure < 13.654383 & TADA.LatitudeMeasure > 13.234189 & TADA.LongitudeMeasure < 144.956712 & TADA.LongitudeMeasure > 144.618068 ~ NA_character_, # Guam
TADA.LatitudeMeasure < 0 ~ "LAT_OutsideUSA",
TADA.LongitudeMeasure > 0 & TADA.LongitudeMeasure < 145 ~ "LONG_OutsideUSA",
# for below, lat and long fields must be numeric
# this checks if there are at least 3 significant figures to the
# right of the decimal point
sapply(.data$TADA.LatitudeMeasure, TADA_DecimalPlaces) < 3 |
sapply(.data$TADA.LongitudeMeasure, TADA_DecimalPlaces) < 3 ~ "Imprecise_lessthan3decimaldigits"
))
# Fill in flag for coordinates that appear OK/PASS tests
.data$TADA.SuspectCoordinates.Flag[is.na(.data$TADA.SuspectCoordinates.Flag)] <- "Pass"
# if clean_imprecise is TRUE, remove imprecise station metadata
if (clean_imprecise == TRUE) {
.data <- dplyr::filter(
.data,
!TADA.SuspectCoordinates.Flag %in% "Imprecise_lessthan3decimaldigits"
)
}
# if clean_outsideUSA is "remove", remove stations flagged as outside the USA
if (clean_outsideUSA == "remove") {
.data <- dplyr::filter(
.data,
!TADA.SuspectCoordinates.Flag %in% c("LAT_OutsideUSA", "LONG_OutsideUSA")
)
}
# if clean_outsideUSA is "change sign", change the sign of lat/long coordinates outside of USA
if (clean_outsideUSA == "change sign") {
print("When clean_outsideUSA == change sign, the sign for any lat/long coordinates flagged as outside of USA are switched. This is a temporary solution. Data owners should fix the raw data to address Suspect coordinates through WQX. For assistance fixing data errors you see in the WQP, email the WQX helpdesk (WQX@epa.gov).")
.data <- .data %>%
dplyr::mutate(
TADA.LatitudeMeasure = dplyr::case_when(
TADA.SuspectCoordinates.Flag == "LAT_OutsideUSA" ~ TADA.LatitudeMeasure * (-1),
TRUE ~ TADA.LatitudeMeasure
),
TADA.LongitudeMeasure = dplyr::case_when(
TADA.SuspectCoordinates.Flag == "LONG_OutsideUSA" ~ TADA.LongitudeMeasure * (-1),
TRUE ~ TADA.LongitudeMeasure
)
)
}
# return only flagged data if flaggedonly = true
if ((flaggedonly == TRUE)) {
.data <- dplyr::filter(.data, !TADA.SuspectCoordinates.Flag %in% c("OK"))
}
if (all(.data$TADA.SuspectCoordinates.Flag %in% c("OK")) == TRUE) {
if (orig_dim == dim(.data)[1]) {
print("Your dataframe does not contain monitoring stations with Suspect coordinates. Returning input dataframe with TADA.SuspectCoordinates.Flag column for tracking.")
} else {
print("All Suspect coordinates were removed. Returning input dataframe with TADA.SuspectCoordinates.Flag column for tracking.")
}
}
.data <- TADA_OrderCols(.data)
return(.data)
}
#' Identify Potentially Duplicated Data Uploads by Multiple Organizations
#'
#' Identifies data records uploaded by different organizations with the same date,
#' time, characteristic name, and result value within X meters of each other and
#' flags as potential duplicates. However, it is at the discretion of the data user
#' to determine if the data records are unique or represent overlap that could cause
#' issues in the data analysis.
#'
#' This function runs TADA_FindNearbySites within it which adds the
#' TADA.MonitoringLocationIdentifier field. Duplicates are only flagged as duplicates if
#' the distance between sites is less than the function input dist_buffer
#' (default is 100m). Each group in the TADA.MonitoringLocationIdentifier field indicates
#' that the sites within each group are within the specified distance from each other.
#'
#' We recommend running TADA_FindPotentialDuplicatesMultipleOrgs after running
#' TADA_FindPotentialDuplicatesSingleOrg.
#'
#' @param .data TADA dataframe
#'
#' @param dist_buffer Numeric. The distance in meters below which two sites with
#' measurements at the same time on the same day of the same parameter will
#' be flagged as potential duplicates.
#'
#' @param org_hierarchy Vector of organization identifiers that acts as the
#' order in which the function should select a result as the representative
#' duplicate, based on the organization that collected the data. If left
#' blank, the function chooses the representative duplicate result at random.
#'
#' @return The same input TADA dataframe with four additional columns: a
#' TADA.MultipleOrgDuplicate column indicating if there is evidence that
#' results are likely duplicated due to submission of the same dataset by two
#' or more different organizations, a TADA.MultipleOrgDupGroupID column
#' containing a number unique to results that may represent duplicated
#' measurement events, a TADA.ResultSelectedMultipleOrgs column indicating
#' which rows are selected to keep (Y) and remove (N) based on the
#' org hierarchy, and a TADA.MonitoringLocationIdentifier column indicating which
#' monitoring locations are within the distance buffer from each other.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Load dataset
#' dat <- TADA_DataRetrieval(
#' startDate = "2022-09-01",
#' endDate = "2023-05-01", statecode = "PA", sampleMedia = "Water", ask = FALSE
#' )
#' unique(dat$OrganizationIdentifier)
#' # If duplicates across organizations exist, pick the result belonging
#' # to "21PA_WQX" if available.
#' dat1 <- TADA_FindPotentialDuplicatesMultipleOrgs(dat,
#' dist_buffer = 100, org_hierarchy = c("21PA_WQX")
#' )
#' table(dat1$TADA.ResultSelectedMultipleOrgs)
#' }
#'
TADA_FindPotentialDuplicatesMultipleOrgs <- function(.data, dist_buffer = 100,
org_hierarchy = "none") {
# from those datapoints, determine which are in adjacent sites
if (!"TADA.NearbySites.Flag" %in% names(.data)) {
.data <- TADA_FindNearbySites(.data, dist_buffer = dist_buffer)
}
dupsites <- unique(.data[, c(
"MonitoringLocationIdentifier", "TADA.LatitudeMeasure",
"TADA.LongitudeMeasure", "TADA.MonitoringLocationIdentifier",
"TADA.NearbySiteGroup"
)])
# get rid of results with no site group added - not duplicated spatially
dupsites <- dupsites %>%
dplyr::filter(!is.na(TADA.NearbySiteGroup))
# remove results with no nearby sites get all data that are not NA and round to 2 digits
dupsprep <- .data %>%
dplyr::filter(MonitoringLocationIdentifier %in% dupsites$MonitoringLocationIdentifier) %>%
dplyr::select(
OrganizationIdentifier, ResultIdentifier, ActivityStartDate, ActivityStartTime.Time,
TADA.CharacteristicName, ActivityTypeCode, TADA.ResultMeasureValue,
TADA.MonitoringLocationIdentifier, TADA.NearbySiteGroup
) %>%
dplyr::filter(!is.na(TADA.ResultMeasureValue)) %>%
dplyr::mutate(roundRV = round(TADA.ResultMeasureValue, digits = 2))
# remove intermediate object
rm(dupsites)
# group by date, time, characteristic, and rounded result value and determine the number of
# organizations that have those same row values, and filter to those summary rows with more than
# one organization
dups_sum <- dupsprep %>%
dplyr::group_by(
ActivityStartDate, ActivityStartTime.Time, TADA.CharacteristicName,
ActivityTypeCode, roundRV, TADA.MonitoringLocationIdentifier,
TADA.NearbySiteGroup
) %>%
dplyr::mutate(numorgs = length(unique(OrganizationIdentifier))) %>%
dplyr::filter(numorgs > 1) %>%
# group duplicates
dplyr::mutate(TADA.MultipleOrgDupGroupID = dplyr::cur_group_id()) %>%
dplyr::select(-numorgs) %>%
dplyr::ungroup()
# merge to data
dupsdat <- dplyr::left_join(dups_sum, .data, by = c(
"ActivityStartDate",
"ActivityStartTime.Time",
"TADA.CharacteristicName",
"ActivityTypeCode",
"OrganizationIdentifier",
"ResultIdentifier",
"TADA.ResultMeasureValue",
"TADA.MonitoringLocationIdentifier",
"TADA.NearbySiteGroup"
)) %>%
dplyr::mutate(TADA.MultipleOrgDuplicate = ifelse(is.na(TADA.MultipleOrgDupGroupID), "N", "Y")) %>%
# remove results that are listed twice (as part of two groups)
dplyr::group_by(ResultIdentifier) %>%
dplyr::slice_sample(n = 1) %>%
dplyr::ungroup() %>%
dplyr::select(-roundRV)
# remove intermediate object
rm(dups_sum)
# select representative results
if (dim(dupsdat)[1] > 0) {
# make a selection of a representative result
if (!any(org_hierarchy == "none")) { # if there is an org hierarchy, use that to pick result
# with lowest rank in hierarchy
data_orgs <- unique(.data$OrganizationIdentifier)
if (any(!org_hierarchy %in% data_orgs)) {
print("TADA_FindPotentialDuplicatesMultipleOrgs: One or more organizations in input hierarchy are not present in the input dataset.")
}
hierarchy_df <- data.frame("OrganizationIdentifier" = org_hierarchy, "rank" = 1:length(org_hierarchy))
dupranks <- dupsdat %>%
dplyr::select(ResultIdentifier, OrganizationIdentifier, TADA.MultipleOrgDupGroupID) %>%
dplyr::left_join(hierarchy_df, by = "OrganizationIdentifier")
} else {
dupranks <- dupsdat %>%
dplyr::select(ResultIdentifier, TADA.MultipleOrgDupGroupID) %>%
dplyr::mutate(rank = 99)
}
dupranks$rank[is.na(dupranks$rank)] <- 99
duppicks <- dupranks %>%
dplyr::select(ResultIdentifier, TADA.MultipleOrgDupGroupID, rank) %>%
dplyr::group_by(TADA.MultipleOrgDupGroupID) %>%
dplyr::slice_min(rank) %>%
dplyr::slice_sample(n = 1) %>%
dplyr::ungroup() %>%
dplyr::group_by(ResultIdentifier) %>%
dplyr::slice_min(rank) %>%
dplyr::slice_sample(n = 1)
dupsdat <- dupsdat %>%
dplyr::rename(SingleNearbyGroup = TADA.MonitoringLocationIdentifier) %>%
dplyr::mutate(
TADA.MonitoringLocationIdentifier = paste(SingleNearbyGroup, sep = ","),
TADA.ResultSelectedMultipleOrgs = ifelse(ResultIdentifier %in% duppicks$ResultIdentifier, "Y", "N")
) %>%
dplyr::select(-SingleNearbyGroup)
# connect back to original dataset
.data <- .data %>%
dplyr::mutate(
TADA.MonitoringLocationIdentifier = ifelse(TADA.MonitoringLocationIdentifier %in% NA, "NA", TADA.MonitoringLocationIdentifier)
) %>%
dplyr::full_join(dupsdat, by = c(names(.data))) %>%
dplyr::mutate(
TADA.MultipleOrgDuplicate = ifelse(is.na(TADA.MultipleOrgDuplicate), "N", TADA.MultipleOrgDuplicate),
TADA.ResultSelectedMultipleOrgs = ifelse(is.na(TADA.ResultSelectedMultipleOrgs), "Y", TADA.ResultSelectedMultipleOrgs),
TADA.MultipleOrgDupGroupID = ifelse(is.na(TADA.MultipleOrgDupGroupID), "Not a duplicate", TADA.MultipleOrgDupGroupID)
) %>%
dplyr::mutate(
TADA.MonitoringLocationIdentifier = ifelse(TADA.MonitoringLocationIdentifier %in% "NA", "NA - Not Available", TADA.MonitoringLocationIdentifier)
)
print(paste0(length(dupsdat$TADA.MultipleOrgDuplicate[dupsdat$TADA.MultipleOrgDuplicate %in% c("Y")]), " potentially duplicated results found in dataset. These have been placed into duplicate groups in the TADA.MultipleOrgDupGroupID column and the TADA.MultipleOrgDuplicate column is set to 'Y' (yes). If you provided an organization hierarchy, the result with the lowest ranked organization identifier was selected as the representative result in the TADA.ResultSelectedMultipleOrgs (this column is set to 'Y' for all results either selected or not considered duplicates)."))
} else { # no duplicate results
.data$TADA.MultipleOrgDupGroupID <- "Not a duplicate"
.data$TADA.MultipleOrgDuplicate <- "N"
.data$TADA.ResultSelectedMultipleOrgs <- "Y"
print("No duplicate results detected. Returning input dataframe with duplicate flagging columns set to 'N'.")
}
.data <- TADA_OrderCols(.data)
return(.data)
}
#' Identify Potentially Duplicated Data Uploads by a Single Organization
#'
#' Identifies data records uploaded by the same organization with the same date,
#' time, monitoring location, activity type, characteristic name, fraction,
#' taxonomic name, depth columns, and result value and flags as potential
#' duplicates. However, it is at the discretion of the data user to determine if
#' the data records are unique or represent overlap that could cause issues in
#' the data analysis. Note, the dataset may contain data from multiple
#' organizations: the function performs the same analysis on data from each
#' organization.
#'
#' @param .data TADA dataframe
#' @return The same input TADA dataframe with additional columns: a
#' TADA.SingleOrgDupGroupID column indicating whether a result is part of a
#' group that shares the same date, time, location, characteristic, etc. If
#' multiple rows include duplicates within a single organization, the rows will
#' have the same number identifier in the TADA.SingleOrgDupGroupID column.
#' In addition, the column TADA.SingleOrgDup.Flag is added, which randomly
#' flags rows within each TADA.SingleOrgDupGroupID group for removal. Rows
#' randomly selected for potential removal within a duplicate group will have the
#' TADA.SingleOrgDup.Flag = 'Duplicate' and all other results in the group will have
#' the value TADA.SingleOrgDup.Flag = 'Unique'.
#'
#' @export
#'
#' @examples
#' # Load dataset
#' data(Data_6Tribes_5y)
#' # If duplicates exist, identify and flag them for removal
#' Data_6Tribes_5y_dups <- TADA_FindPotentialDuplicatesSingleOrg(Data_6Tribes_5y)
#' table(Data_6Tribes_5y_dups$TADA.SingleOrgDup.Flag)
#'
TADA_FindPotentialDuplicatesSingleOrg <- function(.data) {
# find the depth columns in the dataset
depthcols <- names(.data)[grepl("^TADA.*DepthHeightMeasure.MeasureValue$", names(.data))]
# tack depth columns onto additional grouping columns
cols <- c("OrganizationIdentifier", "MonitoringLocationIdentifier", "ActivityStartDate", "ActivityStartTime.Time", "ActivityTypeCode", "TADA.CharacteristicName", "SubjectTaxonomicName", "TADA.ResultSampleFractionText", "TADA.ResultMeasureValue", depthcols)
# find where the grouping using the columns above results in more than one result identifier
dups_sum_org <- .data %>%
dplyr::group_by(dplyr::across(tidyselect::any_of(cols))) %>%
dplyr::summarise(
numres = length(unique(ResultIdentifier)),
.groups = "keep"
) %>%
dplyr::filter(numres > 1) %>%
dplyr::mutate(TADA.SingleOrgDupGroupID = dplyr::cur_group_id())
# remove intermediate objects
rm(depthcols, cols)
if (dim(dups_sum_org)[1] > 0) {
# apply to .data and remove numbers column
.data <- merge(.data, dups_sum_org, all.x = TRUE)
.data <- .data %>% dplyr::select(-numres)
.data$TADA.SingleOrgDupGroupID[is.na(.data$TADA.SingleOrgDupGroupID)] <- "Not a duplicate"
# flag rows randomly within a duplicate group for potential removal
dup_rids <- subset(.data, !is.na(.data$TADA.SingleOrgDupGroupID))$ResultIdentifier
picks <- .data %>%
dplyr::filter(!is.na(TADA.SingleOrgDupGroupID)) %>%
dplyr::group_by(TADA.SingleOrgDupGroupID) %>%
dplyr::slice_sample(n = 1)
.data$TADA.SingleOrgDup.Flag <- "Duplicate"
# flags potential duplicates as "Duplicate" for easy filtering
.data$TADA.SingleOrgDup.Flag <- ifelse(.data$ResultIdentifier %in% picks$ResultIdentifier, "Unique", .data$TADA.SingleOrgDup.Flag)
# flags non-duplicates as passing
.data$TADA.SingleOrgDup.Flag <- ifelse(.data$TADA.SingleOrgDupGroupID == "Not a duplicate", "Unique", .data$TADA.SingleOrgDup.Flag)
print(paste0(
"TADA_FindPotentialDuplicatesSingleOrg: ", dim(dups_sum_org)[1],
" groups of potentially duplicated results found in dataset.",
" These have been placed into duplicate groups in the TADA.SingleOrgDupGroupID ",
"column and the function randomly selected one result from each group to ",
"represent a single, unduplicated value. Selected values are indicated in the ",
"TADA.SingleOrgDup.Flag as 'Unique', while duplicates are flagged as 'Duplicate' ",
"for easy filtering."
))
# remove intermediate objects
rm(dup_rids)
}
if (dim(dups_sum_org)[1] == 0) {
# apply to .data and remove numbers column
.data <- merge(.data, dups_sum_org, all.x = TRUE)
.data <- .data %>% dplyr::select(-numres)
.data$TADA.SingleOrgDupGroupID[is.na(.data$TADA.SingleOrgDupGroupID)] <- "Not a duplicate"
# flags non-duplicates as passing
.data$TADA.SingleOrgDup.Flag <- ifelse(.data$TADA.SingleOrgDupGroupID == "Not a duplicate", "Unique", .data$TADA.SingleOrgDup.Flag)
print("No duplicate results detected. Returning input dataframe with TADA.SingleOrgDup.Flag flag column set to 'Unique'")
}
# remove intermediate objects
rm(dups_sum_org)
# reorder columns
.data <- TADA_OrderCols(.data)
# return TADA df
return(.data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.