# R/RequiredCols.R
#
# Defines functions: TADA_RetainRequired, TADA_CheckRequiredFields, TADA_GetTemplate, TADA_OrderCols
#
# Documented in: TADA_CheckRequiredFields, TADA_GetTemplate, TADA_OrderCols, TADA_RetainRequired

# Lists are used within TADA_OrderCols, TADA_GetTemplate, TADA_CheckRequiredFields,
# TADA_AutoFilter, TADA_RetainRequired

# Ordered character vector of the columns the TADA workflow works with and
# retains in a dataframe. The order of the entries is significant:
# TADA_OrderCols moves the columns that are present in a dataframe to the front
# in exactly this order. TADA_GetTemplate and TADA_CheckRequiredFields use only
# the entries that do not match "TADA." (the original, non-TADA columns), and
# TADA_RetainRequired includes the whole vector in its list of columns to keep.
# Trailing tags on entries — presumably: "required" = expected in the input,
# "generated" = created by TADA functions, "filter" = kept because it is
# commonly used for filtering; confirm with the maintainers.
require.cols <- c(
  # Sample/Measurement Type (e.g. QC or Not)
  "ActivityTypeCode", # required
  "TADA.ActivityType.Flag", # generated
  "TADA.ReplicateSampleID", # generated

  # Media
  "ActivityMediaName", # required
  "TADA.ActivityMediaName", # generated/required/replaces original
  "ActivityMediaSubdivisionName", # filter

  # Comparable Data Groups (e.g. Observable Properties)
  "ResultSampleFractionText", # required in Module 1 but is replaced by TADA version in future modules
  "TADA.ResultSampleFractionText", # generated/required/replaces original
  "TADA.SampleFraction.Flag",
  "Target.TADA.ResultSampleFractionText",
  "TADA.FractionAssumptions",
  "CharacteristicName", # required in Module 1 but is replaced by TADA version in future modules
  "TADA.CharacteristicName", # generated/required/replaces original
  "Target.TADA.CharacteristicName",
  "TADA.CharacteristicNameAssumptions", # generated
  "SubjectTaxonomicName",
  "SampleTissueAnatomyName",
  "MethodSpeciationName", # required in Module 1 but is replaced by TADA version in future modules
  "TADA.MethodSpeciationName", # generated/required/replaces original
  "TADA.Target.MethodSpeciationName", # generated, only added when transform = FALSE in TADA_ConvertResultUnits
  "TADA.MethodSpeciation.Flag", # generated
  "Target.TADA.MethodSpeciationName",
  "Target.TADA.SpeciationConversionFactor",
  "TADA.SpeciationAssumptions",
  "TADA.SpeciationUnitConversion", # generated, only added when transform = FALSE in TADA_ConvertResultUnits
  "TADA.SpeciationConversionFactor",
  "TADA.ComparableDataIdentifier",
  "TADA.Harmonized.Flag",
  "TADA.UseForAnalysis.Flag",

  # Result Time
  "ActivityStartDate", # required
  "ActivityStartTime.Time", # filter
  "ActivityStartTime.TimeZoneCode", # filter
  "ActivityStartDateTime", # required/generated by USGS DR

  # Result Value and Result Unit
  "ResultMeasureValue", # required in Module 1 but is replaced by TADA version in future modules
  "ResultMeasure.MeasureUnitCode", # required in Module 1 but is replaced by TADA version in future modules
  "TADA.ResultMeasureValue", # generated/required/replaces original
  "TADA.ResultMeasure.MeasureUnitCode", # generated/required/replaces original
  "TADA.Target.ResultMeasure.MeasureUnitCode", # generated, only added when transform = FALSE in TADA_ConvertResultUnits
  "TADA.WQXUnitConversionFactor", # generated, only added when transform = FALSE in TADA_ConvertResultUnits
  "TADA.WQXUnitConversionCoefficient", # generated, only added when transform = FALSE in TADA_ConvertResultUnits
  "TADA.WQXResultUnitConversion", # generated, only added when transform = FALSE in TADA_ConvertResultUnits
  "TADA.ResultUnit.Flag", # generated
  "ResultValueTypeName", # required
  "TADA.ResultMeasureValueDataTypes.Flag", # generated
  "TADA.ResultValueAboveUpperThreshold.Flag",
  "TADA.ResultValueBelowLowerThreshold.Flag",

  # Detection Limits
  "ResultDetectionConditionText", # required
  "DetectionQuantitationLimitTypeName",
  "DetectionQuantitationLimitMeasure.MeasureValue",
  "DetectionQuantitationLimitMeasure.MeasureUnitCode",
  "TADA.DetectionQuantitationLimitMeasure.MeasureValue",
  "TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode",
  "TADA.DetectionQuantitationLimitMeasure.MeasureValueDataTypes.Flag",
  "TADA.CensoredData.Flag",
  "TADA.CensoredMethod",

  # Result Depth
  "TADA.ConsolidatedDepth",
  "TADA.ConsolidatedDepth.Bottom",
  "TADA.ConsolidatedDepth.Unit",
  "TADA.DepthCategory.Flag",
  "TADA.DepthProfileAggregation.Flag",
  "ResultDepthHeightMeasure.MeasureValue",
  "TADA.ResultDepthHeightMeasure.MeasureValue",
  "TADA.ResultDepthHeightMeasure.MeasureValueDataTypes.Flag",
  "ResultDepthHeightMeasure.MeasureUnitCode",
  "TADA.ResultDepthHeightMeasure.MeasureUnitCode",
  "TADA.WQXConversionFactor.ResultDepthHeightMeasure",
  "ResultDepthAltitudeReferencePointText",

  # Activity Depth
  "ActivityRelativeDepthName", # required
  "ActivityDepthHeightMeasure.MeasureValue", # required
  "TADA.WQXConversionFactor.ActivityDepthHeightMeasure",
  "TADA.ActivityDepthHeightMeasure.MeasureValue", # generated/required/replaces original
  "TADA.ActivityDepthHeightMeasure.MeasureValueDataTypes.Flag", # generated
  "ActivityDepthHeightMeasure.MeasureUnitCode", # required
  "TADA.ActivityDepthHeightMeasure.MeasureUnitCode", # generated/required/replaces original
  "ActivityTopDepthHeightMeasure.MeasureValue", # required
  "TADA.ActivityTopDepthHeightMeasure.MeasureValue", # generated/required/replaces original
  "TADA.WQXConversionFactor.ActivityTopDepthHeightMeasure",
  "TADA.ActivityTopDepthHeightMeasure.MeasureValueDataTypes.Flag", # generated
  "ActivityTopDepthHeightMeasure.MeasureUnitCode", # required
  "TADA.ActivityTopDepthHeightMeasure.MeasureUnitCode", # generated/required/replaces original
  "ActivityBottomDepthHeightMeasure.MeasureValue", # required
  "TADA.ActivityBottomDepthHeightMeasure.MeasureValue", # generated/required/replaces original
  "TADA.WQXConversionFactor.ActivityBottomDepthHeightMeasure",
  "TADA.ActivityBottomDepthHeightMeasure.MeasureValueDataTypes.Flag", # generated
  "ActivityBottomDepthHeightMeasure.MeasureUnitCode", # required
  "TADA.ActivityBottomDepthHeightMeasure.MeasureUnitCode", # generated/required/replaces original

  # Result Value Context
  "ResultTimeBasisText", # required
  "StatisticalBaseCode", # required
  "ResultFileUrl", # filter
  "TADA.ContinuousData.Flag", # generated, specifies if value is discrete or flags continuous data submitted by data provider (may be high frequency time series results or statistics)
  "TADA.ResultValueAggregation.Flag", # generated, specifies if value is discrete or a daily max, avg, min (used when TADA calculates these)
  "TADA.NutrientSummation.Flag", # generated
  "TADA.NutrientSummationGroup", # generated
  "TADA.NutrientSummationEquation", # generated

  # Sample/Measurement Collection/Analytical Method
  "ResultAnalyticalMethod.MethodName",
  "ResultAnalyticalMethod.MethodDescriptionText",
  "ResultAnalyticalMethod.MethodIdentifier",
  "ResultAnalyticalMethod.MethodIdentifierContext",
  "ResultAnalyticalMethod.MethodUrl",
  "TADA.AnalyticalMethod.Flag",
  "SampleCollectionMethod.MethodIdentifier", # required
  "SampleCollectionMethod.MethodIdentifierContext", # required
  "SampleCollectionMethod.MethodName", # required
  "SampleCollectionMethod.MethodDescriptionText", # required
  "SampleCollectionEquipmentName", # required

  # Result Quality
  "MeasureQualifierCode", # required, could be replaced by TADA.MeasureQualifierCode.Def in future mods
  "ResultStatusIdentifier",
  "TADA.MeasureQualifierCode.Flag", # generated
  "TADA.MeasureQualifierCode.Def", # generated, could replace MeasureQualifierCode in future mods
  "ResultCommentText",
  "ActivityCommentText", # required
  "HydrologicCondition", # filter, weather conditions
  "HydrologicEvent", # filter, weather conditions
  "DataQuality.PrecisionValue", # required
  "DataQuality.BiasValue", # required
  "DataQuality.ConfidenceIntervalValue", # required
  "DataQuality.UpperConfidenceLimitValue", # required
  "DataQuality.LowerConfidenceLimitValue", # required
  "SamplingDesignTypeCode",
  "LaboratoryName",
  "ResultLaboratoryCommentText",
  "ResultIdentifier", # required
  "ActivityIdentifier", # required

  # Organization (e.g. data submitter)
  "OrganizationIdentifier", # required
  "OrganizationFormalName", # required
  "TADA.MultipleOrgDuplicate",
  "TADA.MultipleOrgDupGroupID",
  "TADA.ResultSelectedMultipleOrgs",
  "TADA.SingleOrgDupGroupID",
  "TADA.SingleOrgDup.Flag",

  # Organization Projects
  "ProjectName", # required
  "ProjectDescriptionText",
  "ProjectIdentifier", # required
  "ProjectFileUrl", # required, may include QAPP doc
  "QAPPApprovedIndicator",
  "QAPPApprovalAgencyName",
  "TADA.QAPPDocAvailable", # generated, based on ProjectFileUrl

  # Organization Monitoring Locations
  "CountryCode",
  "StateCode",
  "CountyCode",
  "MonitoringLocationName", # required
  "TADA.MonitoringLocationName", # generated
  "MonitoringLocationTypeName",
  "TADA.MonitoringLocationTypeName", # generated
  "MonitoringLocationDescriptionText",
  "LatitudeMeasure",
  "TADA.LatitudeMeasure", # generated
  "LongitudeMeasure",
  "TADA.LongitudeMeasure", # generated
  "HorizontalCoordinateReferenceSystemDatumName",
  "TADA.SuspectCoordinates.Flag", # generated
  "HUCEightDigitCode",
  "MonitoringLocationIdentifier", # required
  "TADA.MonitoringLocationIdentifier",
  "TADA.NearbySites.Flag", # generated
  "TADA.NearbySiteGroup", # generated

  # Groundwater fields, used for auto filtering for assessment use case but should not be required to have in TADA template
  # NOTE(review): TADA_GetTemplate and TADA_CheckRequiredFields only exclude
  # names matching "TADA.", so the entries below are currently part of both the
  # template and the required-fields check — confirm whether that is intended.
  "AquiferName", # filter, groundwater
  "AquiferTypeName", # filter
  "LocalAqfrName", # filter, groundwater
  "ConstructionDateText", # filter
  "WellDepthMeasure.MeasureValue", # filter
  "WellDepthMeasure.MeasureUnitCode", # filter
  "WellHoleDepthMeasure.MeasureValue", # filter
  "WellHoleDepthMeasure.MeasureUnitCode" # filter
)

# Ordered character vector of non-essential WQP columns that can be removed
# from a dataframe. TADA_OrderCols places the ones that are present after the
# require.cols columns, in this order. TADA_RetainRequired does not include
# this vector in its list of columns to keep.
extra.cols <- c(
  # Others
  "ActivityDepthAltitudeReferencePointText",
  "ActivityEndDate",
  "ActivityEndTime.Time",
  "ActivityEndTime.TimeZoneCode",
  "ActivityEndDateTime", # originally generated by USGS DR, # no longer in default dataRetrieval profile? 11/7/24
  "ActivityConductingOrganizationText",
  "SampleAquifer",
  "ActivityLocation.LatitudeMeasure",
  "ActivityLocation.LongitudeMeasure",
  "ResultWeightBasisText",
  "ResultTemperatureBasisText",
  "ResultParticleSizeBasisText",
  "USGSPCode",
  "BinaryObjectFileName",
  "BinaryObjectFileTypeCode",
  "AnalysisStartDate",
  "ResultDetectionQuantitationLimitUrl",
  "LabSamplePreparationUrl",
  "timeZoneStart", # no longer in default dataRetrieval profile? 11/7/24
  "timeZoneEnd", # no longer in default dataRetrieval profile? 11/7/24
  "ActivityStartTime.TimeZoneCode_offset", # new column from default dataRetrieval profile? 11/7/24
  "ActivityEndTime.TimeZoneCode_offset", # new column from default dataRetrieval profile? 11/21/24
  "SourceMapScaleNumeric",
  "HorizontalAccuracyMeasure.MeasureValue",
  "HorizontalAccuracyMeasure.MeasureUnitCode",
  "HorizontalCollectionMethodName",
  "VerticalMeasure.MeasureValue",
  "VerticalMeasure.MeasureUnitCode",
  "VerticalAccuracyMeasure.MeasureValue",
  "VerticalAccuracyMeasure.MeasureUnitCode",
  "VerticalCollectionMethodName",
  "VerticalCoordinateReferenceSystemDatumName",
  "FormationTypeText",
  "ProjectMonitoringLocationWeightingUrl",
  "DrainageAreaMeasure.MeasureValue",
  "DrainageAreaMeasure.MeasureUnitCode",
  "ContributingDrainageAreaMeasure.MeasureValue",
  "ContributingDrainageAreaMeasure.MeasureUnitCode",
  "ProviderName",
  "LastUpdated"
)

# Ordered character vector of the columns carrying the "ATTAINS." prefix.
# TADA_OrderCols places the ones that are present after the last.cols columns,
# and TADA_RetainRequired includes this vector in its list of columns to keep.
attains.cols <- c(
  "ATTAINS.organizationid",
  "ATTAINS.submissionid",
  "ATTAINS.hasprotectionplan",
  "ATTAINS.assessmentunitname",
  "ATTAINS.nhdplusid",
  "ATTAINS.tas303d",
  "ATTAINS.isthreatened",
  "ATTAINS.state",
  "ATTAINS.on303dlist",
  "ATTAINS.organizationname",
  "ATTAINS.region",
  "ATTAINS.Shape_Length",
  "ATTAINS.reportingcycle",
  "ATTAINS.assmnt_joinkey",
  "ATTAINS.hastmdl",
  "ATTAINS.orgtype",
  "ATTAINS.permid_joinkey",
  "ATTAINS.catchmentistribal",
  "ATTAINS.ircategory",
  "ATTAINS.waterbodyreportlink",
  "ATTAINS.assessmentunitidentifier",
  "ATTAINS.overallstatus",
  "ATTAINS.isassessed",
  "ATTAINS.isimpaired",
  "ATTAINS.has4bplan",
  "ATTAINS.huc12",
  "ATTAINS.hasalternativeplan",
  "ATTAINS.visionpriority303d",
  "ATTAINS.areasqkm",
  "ATTAINS.catchmentareasqkm",
  "ATTAINS.catchmentstatecode",
  "ATTAINS.catchmentresolution",
  "ATTAINS.Shape_Area",
  "ATTAINS.xwalk_method",
  "ATTAINS.xwalk_huc12_version",
  "ATTAINS.waterTypeCode"
)

# Columns that are only used in TADA Shiny or that should sit at the end of the
# dataframe. TADA_OrderCols places the ones that are present after the
# extra.cols columns (the attains.cols columns are then placed after these),
# and TADA_RetainRequired includes this vector in its list of columns to keep.
last.cols <- c(
  "TADA.Remove", "TADA.RemovalReason",
  "TADAShiny.tab", "geometry"
)


#' Order TADA Columns and Rows
#'
#' This utility function moves all TADA-created columns next to associated
#' original columns in the dataframe. The function also reorders all columns to
#' improve readability and usability, and orders the dataframe rows by ResultIdentifier.
#'
#' @param .data TADA dataframe
#'
#' @return A reordered TADA handled dataframe.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Find web service URLs for each Profile using WQP User Interface (https://www.waterqualitydata.us/)
#' # Example WQP URL: https://www.waterqualitydata.us/#statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&providers=NWIS&providers=STEWARDS&providers=STORET
#'
#' # Use TADA_ReadWQPWebServices to load the Station, Project, and Phys-Chem Result profiles
#' stationProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Station/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&providers=NWIS&providers=STEWARDS&providers=STORET")
#' physchemProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Result/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&dataProfile=resultPhysChem&providers=NWIS&providers=STEWARDS&providers=STORET")
#' projectProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Project/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&providers=NWIS&providers=STEWARDS&providers=STORET")
#'
#' # Join all three profiles using TADA_JoinWQPProfiles
#' TADAProfile <- TADA_JoinWQPProfiles(
#'   FullPhysChem = physchemProfile,
#'   Sites = stationProfile, Projects = projectProfile
#' )
#'
#' # Run TADA_OrderCols
#' Reordered_TADAProfile <- TADA_OrderCols(TADAProfile)
#' }
#'
TADA_OrderCols <- function(.data) {
  # Restrict each reference list to the columns actually present in .data.
  # Subsetting the reference list (rather than names(.data)) keeps the order of
  # the reference list, which is what defines the column order of the result.
  present <- names(.data)
  required_cols <- require.cols[require.cols %in% present]
  extra_cols <- extra.cols[extra.cols %in% present]
  last_cols <- last.cols[last.cols %in% present]
  attains_cols <- attains.cols[attains.cols %in% present]

  # Required columns are moved to the front; every following group is then
  # placed after the group before it. Columns that appear in none of the lists
  # are not named in any relocate() call and keep their relative order.
  rearranged <- .data %>%
    dplyr::relocate(tidyselect::any_of(required_cols)) %>%
    dplyr::relocate(tidyselect::any_of(extra_cols), .after = tidyselect::any_of(required_cols)) %>%
    dplyr::relocate(tidyselect::any_of(last_cols), .after = tidyselect::any_of(extra_cols)) %>%
    dplyr::relocate(tidyselect::any_of(attains_cols), .after = tidyselect::any_of(last_cols))

  # Sort rows by ResultIdentifier, but only when that column exists. Without
  # the guard a missing column evaluates to NULL, order(NULL) is integer(0),
  # and indexing with it would silently drop every row of the dataframe.
  # drop = FALSE keeps a one-column data.frame from collapsing to a vector.
  if ("ResultIdentifier" %in% names(rearranged)) {
    row_order <- order(rearranged[["ResultIdentifier"]])
    rearranged <- rearranged[row_order, , drop = FALSE]
  }

  rearranged
}



#' Get TADA Template
#'
#' This function returns a blank TADA template that can be used as a starting
#' point to reformat your own custom data set into the TADA format.
#'
#' @return A TADA template with all required columns for the TADA workflow.
#'
#' @export
#'
#' @examples
#' TADA_Template <- TADA_GetTemplate()
#'
TADA_GetTemplate <- function() {
  # Keep only the original (non-TADA) column names by dropping every entry of
  # require.cols that contains the literal string "TADA.". fixed = TRUE makes
  # the "." a literal dot; as a regular expression it would match any
  # character and also drop names that merely contain "TADA" followed by
  # another character.
  template_cols <- require.cols[!grepl("TADA.", require.cols, fixed = TRUE)]

  # Build an empty (zero-row) dataframe with one column per template field.
  templatedata <- data.frame(matrix(nrow = 0, ncol = length(template_cols)))
  colnames(templatedata) <- template_cols

  templatedata
}



#' TADA Required Fields Check
#'
#' This function checks if all fields required to run TADA functions are included in the input
#' dataframe. It is used in the TADA Shiny application to test user supplied files for compatibility
#' with the application.
#'
#' @param .data A dataframe
#'
#' @return `TRUE` if the input dataframe contains all of the required fields. Otherwise the
#' function stops with an error that lists the names of all missing columns; it never
#' returns `FALSE`.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Find web service URLs for each Profile using WQP User Interface (https://www.waterqualitydata.us/)
#' # Example WQP URL: https://www.waterqualitydata.us/#statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&providers=NWIS&providers=STEWARDS&providers=STORET
#'
#' # Use TADA_ReadWQPWebServices to load the Station, Project, and Phys-Chem Result profiles
#' stationProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Station/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&providers=NWIS&providers=STEWARDS&providers=STORET")
#' physchemProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Result/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&dataProfile=resultPhysChem&providers=NWIS&providers=STEWARDS&providers=STORET")
#' projectProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Project/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&providers=NWIS&providers=STEWARDS&providers=STORET")
#'
#' # Join all three profiles using TADA_JoinWQPProfiles
#' TADAProfile <- TADA_JoinWQPProfiles(
#'   FullPhysChem = physchemProfile, Sites = stationProfile,
#'   Projects = projectProfile
#' )
#'
#' # Run TADA_CheckRequiredFields, returns error message,
#' # 'TADA_CheckRequiredFields: the dataframe does not contain the required fields: ActivityStartDateTime'
#' TADA_CheckRequiredFields(TADAProfile)
#'
#' # Add missing col
#' TADAProfile1 <- TADA_CreateDateTime(
#'   .data = TADAProfile,
#'   date_col = "ActivityStartDate",
#'   time_col = "ActivityStartTime.Time",
#'   tz_col = "ActivityStartTime.TimeZoneCode",
#'   tz = "UTC"
#' )
#'
#' review_TADAProfile1 <- TADAProfile1 %>% dplyr::select(c(
#'   "ActivityStartDate",
#'   "ActivityStartTime.Time",
#'   "ActivityStartTime.TimeZoneCode",
#'   "ActivityStartDateTime",
#'   "ActivityStartTime.TimeZoneCode_offset"
#' ))
#'
#' # re-run TADA_CheckRequiredFields, returns TRUE
#' TADA_CheckRequiredFields(TADAProfile1)
#' }
#'
TADA_CheckRequiredFields <- function(.data) {
  # Reject anything that is not a dataframe before looking at column names.
  if (!inherits(.data, "data.frame")) {
    stop("Input object must be of class 'data.frame'")
  }

  # Only the original (non-TADA) columns are required: drop every entry of
  # require.cols that contains the literal string "TADA.". fixed = TRUE makes
  # the "." a literal dot; as a regular expression it would match any
  # character and could exempt names that merely contain "TADA" followed by
  # another character.
  require.originals <- require.cols[!grepl("TADA.", require.cols, fixed = TRUE)]

  # Required names that are absent from the input, in require.cols order.
  missingcols <- setdiff(require.originals, colnames(.data))

  if (length(missingcols) > 0) {
    stop(
      "TADA_CheckRequiredFields: the dataframe does not contain the required fields: ",
      paste(missingcols, collapse = ", ")
    )
  }

  TRUE
}



#' TADA_RetainRequired
#'
#' This function removes all duplicate columns where TADA has created a new column with a TADA prefix.
#' It retains all TADA prefixed columns as well as other original fields that are either required by
#' other TADA functions or are commonly used filters. Using this function allows the user to accept
#' all TADA created changes and reduce the size of the data set before using TADA mapping or data
#' visualization features in the TADA package or Shiny app.
#'
#' @param .data A dataframe
#'
#' @return A dataframe containing all required fields for use with TADA as well as fields
#' commonly used for filtering.
#'
#' @export
#'
#' @examples
#' data(Data_Nutrients_UT)
#' reducedcols_Data_Nutrients_UT <- TADA_RetainRequired(Data_Nutrients_UT)
#'
TADA_RetainRequired <- function(.data) {
  # check .data is data.frame
  TADA_CheckType(.data, "data.frame", "Input object")

  # execute function after TADA_CheckType passes
  print("TADA_RetainRequired: removing columns not required for TADA workflow including original columns that have been replaced with TADA prefix duplicates.")

  # Columns to keep, in the order they will appear in the result.
  keep.cols <- c(require.cols, attains.cols, last.cols)

  # Remember the incoming column names so the removed ones can be reported.
  original.cols <- names(.data)

  # Keep only columns whose name exactly matches an entry of keep.cols.
  # any_of() matches whole names and ignores entries that are not present;
  # contains() would match case-insensitive substrings and therefore keep
  # columns such as "ActivityLocation.LatitudeMeasure" (because it contains
  # "LatitudeMeasure") while still reporting them as removed.
  .data <- .data %>%
    dplyr::select(dplyr::any_of(keep.cols))

  # Report what was actually dropped, based on the resulting dataframe.
  remove.cols <- setdiff(original.cols, names(.data))

  if (length(remove.cols) > 0) {
    # Join the names with ", " and turn the last separator into " and ".
    remove.paste <- stringi::stri_replace_last_fixed(
      paste(remove.cols, collapse = ", "),
      ", ",
      " and "
    )
    print(paste0("The following non-required columns were removed: ", remove.paste, "."))
  } else {
    # Avoid printing an empty list ("... were removed: .").
    print("TADA_RetainRequired: no non-required columns were found to remove.")
  }

  .data
}
# USEPA/TADA documentation built on April 12, 2025, 1:47 p.m.