R/database.R

Defines functions connect_cyan generate_location_index generate_parameter_index get_cyan_data get_bivariate add_GMT_time add_solar_noon add_trophic_status add_WHO_category add_EPA_recreational_threshold

Documented in add_EPA_recreational_threshold add_GMT_time add_solar_noon add_trophic_status add_WHO_category connect_cyan generate_location_index generate_parameter_index get_bivariate get_cyan_data

#' Connect to the CyAN database
#'
#' Generate a connection to the CyAN database or another sqlite database using
#' the CyAN schema and parameter table
#'
#' @param path a character string giving the path to the database
#'
#' @return a SQLite database connection
#'
#' @examples
#' #Connect to the example database bundled with CyAN
#' path <- system.file("extdata", "example.db", package = "CyAN")
#' db_connection <- connect_cyan(path)
#'
#' @export

connect_cyan <- function(path) {

  if(!file.exists(path)) {
    stop("file path doesn't exist, or isn't available")
  }

  cyan_connection <- DBI::dbConnect(RSQLite::SQLite(), dbname = path)
  table_names <- DBI::dbListTables(cyan_connection)

  if(!(all(c("ACTIVITY", "LOCATION", "RESULT", "PARAMETER", "METHOD") %in% table_names))) {
    stop("not a valid CyAN database")
  }

  return(cyan_connection)

}

#' Generate an index of locations and parameters from the database
#'
#' Based on the CyAN database connection, generate an index of all unique combinations
#' of locations and the parameters that have been collected there
#'
#' @param cyan_connection a CyAN database connection from \code{connect_cyan()}
#'
#' @return a data frame of all unique combinations of location and parameter
#'
#' @examples
#' #Connect to the example database bundled with CyAN
#' path <- system.file("extdata", "example.db", package = "CyAN")
#' db_connection <- connect_cyan(path)
#' #Generate the location index
#' location_index <- generate_location_index(db_connection)
#'
#' @export
#'

generate_location_index <- function(cyan_connection) {

  LOCATION_ID <- LOCATION_NAME <- LATITUDE <- LONGITUDE <-
    ACTIVITY_ID <- PARAMETER_ID <- ".dplyr.var"

  location <- dplyr::tbl(cyan_connection, "LOCATION")
  location <- dplyr::select(location, LOCATION_ID, LOCATION_NAME, LATITUDE, LONGITUDE)
  activity <- dplyr::tbl(cyan_connection, "ACTIVITY")
  activity <- dplyr::select(activity, LOCATION_ID, ACTIVITY_ID)
  result <- dplyr::tbl(cyan_connection, "RESULT")
  result <- dplyr::select(result, ACTIVITY_ID, PARAMETER_ID)

  location_index <- dplyr::inner_join(location, activity, by = "LOCATION_ID")
  location_index <- dplyr::inner_join(location_index, result, by = "ACTIVITY_ID")
  location_index <- dplyr::select(location_index, LOCATION_ID, PARAMETER_ID)
  location_index <- dplyr::distinct(location_index)
  location_index <- dplyr::left_join(location_index, location, by = "LOCATION_ID")
  location_index <- dplyr::select(location_index, LOCATION_NAME, LATITUDE, LONGITUDE, PARAMETER_ID)
  location_index <- dplyr::collect(location_index)

  return(location_index)

}

#' Generate a table of parameter identifiers and their short names
#'
#' @param cyan_connection a CyAN database connection from \code{connect_cyan()}
#' @param has_data a logical indicating whether to limit the parameter list to only
#' those parameters that have data populated in the database
#'
#' @return a data frame of parameter identifiers and their associated
#' short names
#'
#' @examples
#' #Connect to the example database bundled with CyAN
#' path <- system.file("extdata", "example.db", package = "CyAN")
#' db_connection <- connect_cyan(path)
#' #Generate the parameter index
#' parameter_index <- generate_parameter_index(db_connection)
#' #Limit the index to the parameters that have data
#' parameter_index_hasdata <- generate_parameter_index(db_connection, has_data = TRUE)
#'
#' @export

generate_parameter_index <- function(cyan_connection, has_data = FALSE) {

  PARAMETER_ID <- SHORT_NAME <- ".dplyr.var"

  parameter_list <- dplyr::tbl(cyan_connection, "PARAMETER")
  parameter_list <- dplyr::select(parameter_list, PARAMETER_ID, SHORT_NAME)
  parameter_list <- dplyr::collect(parameter_list)

  if(has_data) {
    parameter_index <- dplyr::tbl(cyan_connection, "RESULT")
    parameter_index <- dplyr::select(parameter_index, PARAMETER_ID)
    parameter_index <- dplyr::distinct(parameter_index)
    parameter_index <- dplyr::collect(parameter_index)
    parameter_index <- inner_join(parameter_index, parameter_list, by = "PARAMETER_ID")
  } else {
    parameter_index <- parameter_list
  }

  return(parameter_index)

}

#' Get data from CyAN
#'
#' Basic data query from the CyAN database based on a number of parameters,
#' used to drive the data download from the map screen of the shiny app. Can also
#' be used with the plotting functions
#'
#' @param cyan_connection a CyAN database connection from \code{connect_cyan}
#' @param collect a logical indicating whether the query will be pulled into
#' a local tibble using dbplyr::collect. If you are planning on doing further
#' selection or filtering using dplyr, you may want not want to collect until
#' you're ready
#' @param north_latitude,south_latitude numerics indicating the northern most
#' and sothern most latitude that will be included in the query. Latitude
#' should be given as a positive number of degrees north of the equator.
#'
#' @param west_longitude,east_longitude numerics indicating the western most
#' and eastern most latitude that will be included in the query. Longitude
#' should be given as a negative number of decimal degrees west of the prime
#' meridian.
#' @param years numeric vector of years that will be included in the query
#' @param parameters a character vector of parameter names that will be
#' returned in the query
#' @param minimum_tier a number between 1.0 and 4.0 indicating the minimum
#' data tier that will be returned in the query
#' @param states a character vector of the postal codes for the states to be
#' included in the query. Usually, this would not be used if querying by
#' latitude and longitude.
#' @return if collect is FALSE, the query will be generated, but not collected.
#' See the documentation on \code{collect()} for details. Otherwise, if collect
#' is TRUE, the query will be pulled into memory and returned as a tibble.
#'
#' @examples
#' #Connect to the example database bundled with CyAN
#' path <- system.file("extdata", "example.db", package = "CyAN")
#' db_connection <- connect_cyan(path)
#'
#' #Get all of the chlorophyll and chlorophyll-a data (parameter id P0051 & P0054)
#' #or the state of Kansas in the year 2016
#' ks_chl_2016 <- get_cyan_data(db_connection,
#'                              years = 2016,
#'                              parameters = c("P0051", "P0054"),
#'                              states = "KS")
#'
#' #Get chlorophyll-a data within a latitude longitude bounding box. If you
#' #need to reference the parameter table to find parameter codes, you can use
#' #generate_parameter_index()
#' chla_2017 <- get_cyan_data(db_connection,
#'                            north_latitude = 37.818, south_latitude = 37.714,
#'                            west_longitude = -98.028, east_longitude = -97.735,
#'                            years = 2017:2018,
#'                            parameters = "P0051")
#'
#' @export

get_cyan_data <- function(cyan_connection, collect = FALSE,
                          north_latitude = NULL, south_latitude = NULL,
                          west_longitude = NULL, east_longitude = NULL,
                          years = NULL,
                          parameters = NULL,
                          minimum_tier = NULL,
                          states  = NULL) {

  LOCATION_ID <- LOCATION_NAME <- LATITUDE <- LONGITUDE <- DATUM <- STATE_CODE <-
    HUC <- START_DATE <- START_TIME <- END_DATE <- END_TIME <- TZ <-
    COLLECTION_METHOD_ID <- SAMPLE_TYPE_CODE <- ACTIVITY_DEPTH <-
    ACTIVITY_DEPTH_UNIT <- ACTIVITY_TOP_DEPTH <- ACTIVITY_TOP_DEPTH_UNIT <-
    ACTIVITY_BOTTOM_DEPTH <- ACTIVITY_BOTTOM_DEPTH_UNIT <-
    ACTIVITY_DEPTH_REFERENCE <- ACTIVITY_COMMENT <- RESULT_ID <- ACTIVITY_ID <-
    PARAMETER_ID <- METHOD_ID <- RESULT_DEPTH <- RESULT_DEPTH_UNIT <-
    RESULT_DEPTH_REFERENCE <- QUALIFIER <- RESULT_VALUE <- DETECTION_LIMIT_VALUE <-
    DETECTION_LIMIT_UNIT <- DETECTION_LIMIT_TYPE <- TIER <- CROSSWALK_ID <-
    PARAMETER_NAME <- UNITS <- WQP_METHOD_IDENTIFIER <- WQP_METHOD_CONTEXT <-
    WQP_METHOD_NAME <- WQP_METHOD_DESCRIPTION <- WQP_COLLECTION_METHOD_ID <-
    WQP_COLLECTION_METHOD_CONTEXT <- WQP_COLLECTION_METHOD_NAME <- LOCATION_TYPE <-
    LOCAL_TZ <- STRFTIME <- SAMPLE_TYPE_DEFINITION <- YEAR <- WQP_ACTIVITY_ID <- ".dplyr.var"

  location <- dplyr::tbl(cyan_connection, "LOCATION")
  location <- dplyr::select(location,
                            LOCATION_ID, LOCATION_TYPE, LATITUDE, LONGITUDE, DATUM, STATE_CODE, HUC,
                            LOCATION_NAME, LOCAL_TZ)

  activity <- dplyr::tbl(cyan_connection, "ACTIVITY")
  activity <- dplyr::select(activity,
                            ACTIVITY_ID, WQP_ACTIVITY_ID, START_DATE, START_TIME, END_DATE, END_TIME, TZ,
                            LOCATION_ID, COLLECTION_METHOD_ID, SAMPLE_TYPE_CODE, ACTIVITY_DEPTH,
                            ACTIVITY_DEPTH_UNIT, ACTIVITY_TOP_DEPTH, ACTIVITY_TOP_DEPTH_UNIT,
                            ACTIVITY_BOTTOM_DEPTH, ACTIVITY_BOTTOM_DEPTH_UNIT,
                            ACTIVITY_DEPTH_REFERENCE, ACTIVITY_COMMENT)
  activity <- dplyr::mutate(activity,
                            YEAR = as.numeric(STRFTIME('%Y', START_DATE)))

  result <- dplyr::tbl(cyan_connection, "RESULT")
  result <- dplyr::select(result,
                          RESULT_ID, ACTIVITY_ID, PARAMETER_ID, METHOD_ID, RESULT_DEPTH,
                          RESULT_DEPTH_UNIT, RESULT_DEPTH_REFERENCE, QUALIFIER,
                          RESULT_VALUE, DETECTION_LIMIT_VALUE, DETECTION_LIMIT_UNIT,
                          DETECTION_LIMIT_TYPE, CROSSWALK_ID)

  parameter <- dplyr::tbl(cyan_connection, "PARAMETER")
  parameter <- dplyr::select(parameter, PARAMETER_ID, PARAMETER_NAME, UNITS)

  method <- dplyr::tbl(cyan_connection, "METHOD")
  method <- dplyr::select(method,
                          METHOD_ID, WQP_METHOD_IDENTIFIER, WQP_METHOD_CONTEXT, WQP_METHOD_NAME,
                          WQP_METHOD_DESCRIPTION)

  sample_type <- dplyr::tbl(cyan_connection, "SAMPLE_TYPE")
  sample_type <- dplyr::select(sample_type, SAMPLE_TYPE_CODE, SAMPLE_TYPE_DEFINITION)

  collection_method <- dplyr::tbl(cyan_connection, "COLLECTION_METHOD")
  collection_method <- dplyr::select(collection_method,
                                     COLLECTION_METHOD_ID, WQP_COLLECTION_METHOD_ID,
                                     WQP_COLLECTION_METHOD_CONTEXT, WQP_COLLECTION_METHOD_NAME)

  if(!(is.null(north_latitude))) {

    if(!is.numeric(north_latitude)) {
      stop("north_latitude should be a number, or left as NULL")
    }
    if(north_latitude < 24.54409) {
      warning("given north_latitude is south of the contiguous 48 states")
    }

    location <- dplyr::filter(location, LATITUDE <= north_latitude)

  }
  if(!(is.null(south_latitude))) {

    if(!is.numeric(south_latitude)) {
      stop("south_latitude should be a number, or left as NULL")
    }
    if(south_latitude > 49.384472) {
      warning("given south_latitude is north of the contiguous 48 states")
    }

    location <- dplyr::filter(location, LATITUDE >= south_latitude)

  }
  if(!(is.null(west_longitude))) {

    if(!is.numeric(west_longitude)) {
      stop("west_longitude should be a number, or left as NULL")
    }
    if(west_longitude > -66.949778) {
      warning("given west_longitude is east of the contiguous 48 states")
    }

    location <- dplyr::filter(location, LONGITUDE >= west_longitude)

  }
  if(!(is.null(east_longitude))) {

    if(!is.numeric(east_longitude)) {
      stop("east_longitude should be a number, or left as NULL")
    }
    if(east_longitude < -124.667222) {
      warning("given east_longitude is west of the contiguous 48 states")
    }

    location <- dplyr::filter(location, LONGITUDE <= east_longitude)

  }
  if(!(is.null(years))) {

    if(!is.numeric(years)) {
      stop("years should be a vector of integers")
    }

    activity <- dplyr::filter(activity, YEAR %in% years)

  }
  if(!(is.null(parameters))) {
    valid_parms <- dplyr::pull(parameter, PARAMETER_ID)
    if(!all(parameters %in% valid_parms)) {
      invalid <- paste(parameters[!(parameters %in% valid_parms)], collapse = " ")
      stop(paste(invalid, "not valid parameter_ids"))
    }
    result <- dplyr::filter(result, PARAMETER_ID %in% parameters)
  }
  if(!(is.null(minimum_tier))) {
    if(!is.numeric(minimum_tier)) {
      stop("minimum tier should be a number between 5.0 and 1.0")
    }
    if(minimum_tier < 1.0) {
      stop("minimum tier number is 1.0")
    }
    result <- dplyr::filter(result, TIER >= minimum_tier)
  }
  if(!(is.null(states))) {
    if(!all(states %in% datasets::state.abb)) {
      invalid_states <- states[!(states %in% datasets::state.abb)]
      stop(paste(invalid_states, "not valid state abbreviations"))
    }
    location <- dplyr::filter(location, STATE_CODE %in% states)
  }

  if(collect) {
    message("Collecting output")
    location <- dplyr::collect(location)
    activity <- dplyr::collect(activity)
    result <- dplyr::collect(result)
    parameter <- dplyr::collect(parameter)
    method <- dplyr::collect(method)
    sample_type <- dplyr::collect(sample_type)
    collection_method <- dplyr::collect(collection_method)
  }

  output <- dplyr::inner_join(location, activity,
                              by = "LOCATION_ID", suffix = c(".LOCATION", ".ACTIVITY"))
  output <- dplyr::inner_join(output, result,
                              by = "ACTIVITY_ID", suffix = c(".ACTIVITY", ".RESULT"))
  output <- dplyr::inner_join(output, parameter,
                              by = "PARAMETER_ID", suffix = c(".RESULT", ".PARAMETER"))
  output <- dplyr::inner_join(output, method,
                              by = "METHOD_ID", suffix = c(".PARAMETER", ".METHOD"))
  output <- dplyr::inner_join(output, sample_type,
                              by = "SAMPLE_TYPE_CODE",
                              suffix = c(".METHOD", ".SAMPLE_TYPE"))
  output <- dplyr::inner_join(output, collection_method,
                              by = "COLLECTION_METHOD_ID",
                              suffix = c(".SAMPLE_TYPE", "COLLECTION_METHOD"))

  return(output)

}


#' Find simaltaneous measurements of two parameters
#'
#' Find sampling activities that have associated measurements of two parameters
#' of interest - helpful for bivariate plotting/ investigating the relationship
#' between two parameters.
#'
#' @param cyan_connection a CyAN database connection from \code{connect_cyan}
#' @param parameter_1,parameter_2 CyAN parameter codes for the two parameters
#' of interest
#' @param north_latitude,south_latitude numbers indicating the northern most
#' and sothern most latitude that will be included in the query. Latitude
#' should be given as a positive number of degrees north of the equator.
#' @param west_longitude,east_longitude nunbers indicating the western most
#' and eastern most latitude that will be included in the query. Longitude
#' should be given as a negative number of decimal degrees west of the prime
#' meridian.
#' @param years numeric vector of years that will be included in the query
#' @param states a character vector of the postal codes for the states to be
#' included in the query. Usually, this would not be used if querying by
#' latitude and longitude.
#'
#' @return if collect is FALSE, the query will be generated, but not collected.
#' See the documentation on \code{collect} for details. Otherwise, if collect
#' is TRUE, the query will be pulled into memory and returned as a tibble.
#' Returns a wide data frame, with a shared columns related to the activity,
#' and columns related to the analytical result suffixed with a ".1" for parameter
#' 1 and ".2" for parameter 2.
#'
#' @examples
#' #Connect to the example database bundled with CyAN
#' path <- system.file("extdata", "example.db", package = "CyAN")
#' db_connection <- connect_cyan(path)
#'
#' #Get simaltaneous observations of secchi depth (P0002) and chlorophyll-a (P0051)
#' biv_secchi_chla <- get_bivariate(db_connection,
#'                                  parameter_1 = "P0002", parameter_2 = "P0051",
#'                                  north_latitude = 37.818, south_latitude = 37.714,
#'                                  west_longitude = -98.028, east_longitude = -97.735)
#'
#' @export

get_bivariate <- function(cyan_connection, parameter_1, parameter_2,
                          north_latitude = NULL, south_latitude = NULL,
                          west_longitude = NULL, east_longitude = NULL,
                          years = NULL, states = NULL) {

  PARAMETER_ID <- ".dplyr.var"

  all_data <- get_cyan_data(cyan_connection, north_latitude = north_latitude,
                            south_latitude = south_latitude,
                            east_longitude = east_longitude,
                            west_longitude = west_longitude,
                            years = years,
                            states = states,
                            parameters = c(parameter_1, parameter_2),
                            collect = TRUE)

  parameter_1_data <- dplyr::filter(all_data, PARAMETER_ID == parameter_1)
  parameter_2_data <- dplyr::filter(all_data, PARAMETER_ID == parameter_2)

  join_by <- names(all_data)[!(names(all_data) %in%
                                 c("RESULT_ID","PARAMETER_ID","METHOD_ID",
                                   "RESULT_DEPTH","RESULT_DEPTH_UNIT","RESULT_DEPTH_REFERENCE",
                                   "QUALIFIER","RESULT_VALUE","DETECTION_LIMIT_VALUE",
                                   "DETECTION_LIMIT_UNIT","DETECTION_LIMIT_TYPE","CROSSWALK_ID",
                                   "PARAMETER_NAME","UNITS","WQP_METHOD_IDENTIFIER",
                                   "WQP_METHOD_CONTEXT","WQP_METHOD_NAME","WQP_METHOD_DESCRIPTION",
                                   "SAMPLE_TYPE_DEFINITION","WQP_COLLECTION_METHOD_ID",
                                   "WQP_COLLECTION_METHOD_CONTEXT",
                                   "WQP_COLLECTION_METHOD_NAME"))]

  plot_data <- dplyr::inner_join(parameter_1_data, parameter_2_data,
                                 by = join_by,
                                 suffix = c(".1", ".2"))

  return(plot_data)

}

#' Add a column of GMT time to a cyan data query
#'
#' Give a column of GMT time based on the activity start time and local time zone
#'
#' @param cyan_data a data frame from \code{get_cyan_data()} with
#' \code{collect = TRUE}
#'
#' @return cyan_data with an additional character column for time in GMT
#'
#' @examples
#' #Connect to the example database bundled with CyAN
#' path <- system.file("extdata", "example.db", package = "CyAN")
#' db_connection <- connect_cyan(path)
#'
#' #Get all of the chlorophyll and chlorophyll-a data (parameter id P0051 & P0054)
#' #or the state of Kansas in the year 2016
#' ks_chl_2016 <- get_cyan_data(db_connection, collect = TRUE,
#'                              years = 2016,
#'                              parameters = c("P0051", "P0054"),
#'                              states = "KS")
#'
#' #Add the column for GMT time
#' ks_chl_2016_wGMT <- add_GMT_time(ks_chl_2016)
#'
#'
#' @export
#'

add_GMT_time <- function(cyan_data) {

  START_DATE <- START_TIME <- datetime <- ".dplyr.var"

  output <- dplyr::mutate(cyan_data, datetime = paste(START_DATE, START_TIME))

  timezone <- output$TZ
  state <- output$STATE_CODE
  timezone[timezone %in% c("EST", "EDT")] <- "America/New_York"
  timezone[timezone %in% c("CST", "CDT")] <- "America/Chicago"
  timezone[timezone == "MST" & state == "AZ"] <- "America/Phoenix"
  timezone[timezone %in% c("MST", "MDT")] <- "America/Denver"
  timezone[timezone %in% c("PST", "PDT")] <- "America/Los_Angeles"

  gmt_time <- vector(length = nrow(output))
  for(i in 1:nrow(output)) {
    gmt_time[i] <- as.character(lubridate::ymd_hms(output$datetime[i],
                                                   tz = timezone[i]), tz = "GMT",
                                format = "%Y-%m-%d %H:%M:%S %Z")
  }

  gmt_time[output$TZ == ""] <- NA
  output$TIME_GMT <- gmt_time
  output <- dplyr::select(output, -datetime)
  return(output)

}

#' Add a logical column indicating whether the sample was taken during solar noon
#'
#' Solar noon is defined as 1000 - 1400, extended solar noon is defined as
#' 0900 - 1500
#'
#' @param cyan_data a data frame from \code{get_cyan_data()} with
#' \code{collect = TRUE}
#'
#' @return cyan_data with two additional logical columns, solar_noon indicating
#' if the sample was taken during solar noon and ext_solar_noon indicating if
#' the sample was taken during extended solar noon.
#'
#' @examples
#'
#' #Connect to the example database bundled with CyAN
#' path <- system.file("extdata", "example.db", package = "CyAN")
#' db_connection <- connect_cyan(path)
#'
#' #Get all of the chlorophyll and chlorophyll-a data (parameter id P0051 & P0054)
#' #or the state of Kansas in the year 2016
#' ks_chl_2016 <- get_cyan_data(db_connection, collect = TRUE,
#'                              years = 2016,
#'                              parameters = c("P0051", "P0054"),
#'                              states = "KS",)
#'
#' #Add logical columns for solar noon and extended solar noon
#' ks_chl_2016_wsolarnoon <- add_solar_noon(ks_chl_2016)
#'
#' @export

add_solar_noon <- function(cyan_data) {

  START_DATE <- START_TIME <- datetime <- ".dplyr.var"

  output <- dplyr::mutate(cyan_data, datetime = paste(START_DATE, START_TIME))

  timezone <- output$TZ
  state <- output$STATE_CODE
  timezone[timezone %in% c("EST", "EDT")] <- "America/New_York"
  timezone[timezone %in% c("CST", "CDT")] <- "America/Chicago"
  timezone[timezone == "MST" & state == "AZ"] <- "America/Phoenix"
  timezone[timezone %in% c("MST", "MDT")] <- "America/Denver"
  timezone[timezone %in% c("PST", "PDT")] <- "America/Los_Angeles"

  local_tz <- output$LOCAL_TZ

  hour <- vector(length = nrow(output))
  for(i in 1:nrow(cyan_data)) {
    hour[i] <- as.character(lubridate::ymd_hms(output$datetime[i],
                                               tz = timezone[i]),
                            tz = local_tz[i], format = "%H")
  }
  hour <- as.numeric(hour)
  solar_noon <- hour %in% 10:13
  ext_solar_noon <- hour %in% 9:14
  solar_noon[output$TZ == "" | output$LOCAL_TZ == ""] <- NA
  ext_solar_noon[output$TZ == "" | output$LOCAL_TZ == ""] <- NA

  output$solar_noon <- solar_noon
  output$ext_solar_noon <- ext_solar_noon

  output <- dplyr::select(output, -datetime)
  return(output)

}

#' Add columns for trophic status
#'
#' Add three columns for trophic status, where it is possible. Trophic
#' status can be calculated from secchi depth, chlorophyll-a, or
#' total phosphorus
#'
#' @param cyan_data a data frame from \code{get_cyan_data} with
#' \code{collect = TRUE}
#'
#' @return cyan_data with three additional columns. TROPHIC_STATUS_INDEX
#'  giving the numerical trophic status index,
#'  TROPHIC_STATUS - indicating which
#' trophic category that index falls into, and the method used to
#' calculate the trophic stats. TROPHIC_STATUS_METHOD
#'
#' @examples
#'
#' #Connect to the example database bundled with CyAN
#' path <- system.file("extdata", "example.db", package = "CyAN")
#' db_connection <- connect_cyan(path)
#'
#' #Get all of the chlorophyll and chlorophyll-a data (parameter id P0051 & P0054)
#' #or the state of Kansas in the year 2016
#' ks_chl_2016 <- get_cyan_data(db_connection, collect = TRUE,
#'                              years = 2016,
#'                              parameters = c("P0051", "P0054"),
#'                              states = "KS")
#'
#' #Add trophic status columns
#' ks_chl_2016_wtrophic <- add_trophic_status(ks_chl_2016)
#'
#' @export

add_trophic_status <- function(cyan_data) {

  #Add columns
  cyan_data$TROPHIC_STATUS_INDEX <- NA
  cyan_data$TROPHIC_STATUS_METHOD <- NA
  cyan_data$TROPHIC_STATUS <- NA

  #Calculate trophic status based on secchi depth
  cyan_data$TROPHIC_STATUS_INDEX[cyan_data$PARAMETER_ID == "P0002"] <-
    60 - (14.41 * log(cyan_data$RESULT_VALUE[cyan_data$PARAMETER_ID == "P0002"]))
  cyan_data$TROPHIC_STATUS_METHOD[cyan_data$PARAMETER_ID == "P0002"] <- "SD"

  #Calculate trophic status based on chlorophyll-a
  cyan_data$TROPHIC_STATUS_INDEX[cyan_data$PARAMETER_ID == "P0051"] <-
    30.6 + (9.81 * log(cyan_data$RESULT_VALUE[cyan_data$PARAMETER_ID == "P0051"]))
  cyan_data$TROPHIC_STATUS_METHOD[cyan_data$PARAMETER_ID == "P0051"] <- "CHL-A"

  #Calculate trophic status based on total phosphorus
  cyan_data$TROPHIC_STATUS_INDEX[cyan_data$PARAMETER_ID == "P0031"] <-
    4.15 + (14.42 * log(cyan_data$RESULT_VALUE[cyan_data$PARAMETER_ID == "P0031"] * 1000))
  cyan_data$TROPHIC_STATUS_METHOD[cyan_data$PARAMETER_ID == "P0031"] <- "TP"

  #Assign trophic status based on the index
  cyan_data$TROPHIC_STATUS[cyan_data$TROPHIC_STATUS_INDEX >= 0 &
                             cyan_data$TROPHIC_STATUS_INDEX < 20] <- "Ultra-oligotrophy"
  cyan_data$TROPHIC_STATUS[cyan_data$TROPHIC_STATUS_INDEX >= 20 &
                             cyan_data$TROPHIC_STATUS_INDEX < 40] <- "Oligotrophy"
  cyan_data$TROPHIC_STATUS[cyan_data$TROPHIC_STATUS_INDEX >= 40 &
                             cyan_data$TROPHIC_STATUS_INDEX < 50] <- "Mesotrophy"
  cyan_data$TROPHIC_STATUS[cyan_data$TROPHIC_STATUS_INDEX >= 50 &
                             cyan_data$TROPHIC_STATUS_INDEX < 70] <- "Eutrophy"
  cyan_data$TROPHIC_STATUS[cyan_data$TROPHIC_STATUS_INDEX >= 70] <- "Hypereutrophy"

  return(cyan_data)

}

#' Add WHO thresholds
#'
#' Add columns indicating the WHO category for Cyanobacteria, Microcystin-LR, or
#' Cholorophyll-a, if applicable
#'
#' @param cyan_data a data frame from \code{get_cyan_data} with
#' \code{collect = TRUE}
#'
#' @return cyan_data with an additional column WHO_CATEGORY indicating whether
#' this result is classified as Low, Moderate, High, or Very High based on
#' WHO thresholds for the parameter
#'
#' @examples
#'
#' #' #Connect to the example database bundled with CyAN
#' path <- system.file("extdata", "example.db", package = "CyAN")
#' db_connection <- connect_cyan(path)
#'
#' #Get all of the chlorophyll and chlorophyll-a data (parameter id P0051 & P0054)
#' #or the state of Kansas in the year 2016
#' ks_chl_2016 <- get_cyan_data(db_connection, collect = TRUE,
#'                              years = 2016,
#'                              parameters = c("P0051", "P0054"),
#'                              states = "KS")
#'
#' #Add any applicable WHO categories
#' ks_chl_2016_wWHO <- add_WHO_category(ks_chl_2016)
#'
#' @export
#'

add_WHO_category <- function(cyan_data) {

  category <- rep(c("Low", "Moderate", "High", "Very High"), 3)
  lower_bound <- c(0, 20000, 100000, 10000000,
                   0, 10, 20, 2000,
                   0, 10, 50, 5000)
  upper_bound <- c(20000, 100000, 10000000, Inf,
                   10, 20, 2000, Inf,
                   10, 50, 5000, Inf)
  parameter <- c(rep("P0023", 4), rep("P0063", 4), rep("P0051", 4))
  who_cat <- data.frame(category, lower_bound, upper_bound, parameter, stringsAsFactors = FALSE)

  parm <- cyan_data$PARAMETER_ID
  value <- cyan_data$RESULT_VALUE
  cat <- vector(mode = "numeric", length = length(parm))

  for(i in seq_along(parm)) {
    cat_i <- dplyr::filter(who_cat, parameter == parm[i], lower_bound <= value[i], upper_bound > value[i])
    cat_i <- dplyr::pull(cat_i, category)
    if(length(cat_i) != 1)
      cat_i <- NA
    cat[i] <- cat_i
  }

  cyan_data$WHO_CATEGORY <- cat
  return(cyan_data)

}

#' Add EPA recreational threshold
#'
#' Add a column indicating whether the EPA recreational trehsold was exceeded
#'
#' @param cyan_data a data frame from \code{get_cyan_data()} with
#' \code{collect = TRUE}
#'
#' @return cyan_data with an additional column EPA_REC_EXCEEDED - a logical column
#' indicating whether the applicable EPA recreational treshold was exceeded
#'
#' @examples
#'
#' #Connect to the example database bundled with CyAN
#' path <- system.file("extdata", "example.db", package = "CyAN")
#' db_connection <- connect_cyan(path)
#'
#' #Get all of the data for microcystis and cylindrospermopsin
#' ks_toxins <- get_cyan_data(db_connection, collect = TRUE,
#'                              parameters = c("P0073", "P0074"),
#'                              states = "KS")
#'
#' #Add any applicable recreational for toxins
#' ks_toxins_wEPA <- add_EPA_recreational_threshold(ks_toxins)
#'
#' @export
#'

add_EPA_recreational_threshold <- function(cyan_data) {

  cyan_data$EPA_REC_EXCEEDED <- NA

  cyan_data$EPA_REC_EXCEEDED[cyan_data$PARAMETER_ID == "P0073" &
                             cyan_data$RESULT_VALUE >= 8] <- TRUE
  cyan_data$EPA_REC_EXCEEDED[cyan_data$PARAMETER_ID == "P0073" &
                             cyan_data$RESULT_VALUE < 8] <- FALSE

  cyan_data$EPA_REC_EXCEEDED[cyan_data$PARAMETER_ID %in% c("P0074", "P0077") &
                               cyan_data$RESULT_VALUE >= 15] <- TRUE
  cyan_data$EPA_REC_EXCEEDED[cyan_data$PARAMETER_ID %in% c("P0074", "P0077") &
                               cyan_data$RESULT_VALUE < 15] <- FALSE

  return(cyan_data)

}
PatrickEslick/CyAN documentation built on Oct. 2, 2019, 5:50 p.m.