R/RIVPACS.R

#' River Invertebrate Prediction and Classification System (RIVPACS) Index and RIVPACS Condition Category
#' (SoCal only)
#'
#' @description
#'   For more information concerning RIVPACS, consult the CASQO Technical Manual page 80
#'
#' @param benthic_data data frame stored in the R environment. Note that this data frame MUST contain the following
#'                    information with these headings:
#'
#'                         \code{StationID} - an alpha-numeric identifier of the location;
#'
#'                         \code{Replicate} - a numeric identifying the replicate number of samples taken at the location;
#'
#'                         \code{SampleDate} - the date of sample collection;
#'
#'                         \code{Latitude} - latitude in decimal degrees;
#'
#'                         \code{Longitude} - longitude in decimal degrees. Make sure there is a negative sign for the Western coordinates;
#'
#'                         \code{Taxon} - name of the fauna, ideally in SCAMIT ed12 format; do not use sp. or spp.,
#'                                        use sp only or just the Genus. If no animals were present in the sample, use
#'                                        NoOrganismsPresent with 0 abundance;
#'
#'                         \code{Abundance} - the number of each Species observed in a sample;
#'
#'                         \code{Salinity} - the salinity observed at the location in PSU, ideally at time of sampling;
#'
#'                         \code{Stratum} - ;
#'
#'                         \code{Exclude} - ;
#'
#' @usage
#' RIVPACS(benthic_data)
#'
#' @import dplyr
#' @importFrom tidyr pivot_wider
#'
#' @examples
#' data(benthic_sampledata)
#' RIVPACS(benthic_sampledata)
#'
#' @export
#'

#---- RIVPACS WRAPPER FUNCTION ----
# This is what we will use for RIVPACS
RIVPACS <- function(benthic_data){

  # Split to SoCal and SFBay.
  ## We are only working with SoCal data so we don't need to do this!

  #scb.station <- station[toupper(station$HabitatCode) == "C", ]

  #########################
  # At this point of the SQOUnified package, we are only working with SoCal data so we don't need sfb
  #sfb.station <- station[toupper(station$HabitatCode) == "D", ]

  # If data exists for habitat, format data.

  #if(nrow(scb.station) > 0) {




  scb.predictors <- data.frame(Latitude = benthic_data$Latitude,
                               Longitude = benthic_data$Longitude,
                               SampleDepth = benthic_data$SampleDepth) %>%
    dplyr::distinct()
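  # Habitat predictors (Latitude, Longitude, SampleDepth) for each station.
  # These are the variables the SoCal RIVPACS model uses to compute
  # group-membership probabilities.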

  benthic_data <- benthic_data %>% dplyr::rename(Taxa = Taxon)

  scb.taxa <- benthic_data %>% dplyr::select(StationID, Latitude, Longitude, SampleDepth) %>%
    dplyr::distinct()

  row.names(scb.predictors) <- scb.taxa$StationID
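  # Note: scb.taxa at this point is just a station-level table used to attach
  # StationID row names to the predictor matrix; it is overwritten with the
  # taxa abundance table below.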

  scb.predictors <- as.matrix(scb.predictors)

  # Don't need this line because all needed info is in benthic_data data frame
  # and we're only doing SCB data, so no need to classify them
  #scb.taxa <- benthic_data[benthic_data$StationID %in% scb.station$StationID, ]
  scb.taxa <- benthic_data %>%
    dplyr::filter(Replicate == 1) %>%
    dplyr::select(StationID, Taxa, Abundance) %>%
    dplyr::distinct()

  scb.taxa$Taxa <- gsub(" ", "_", scb.taxa$Taxa, fixed = TRUE)
  scb.taxa$Taxa <- gsub("(", "_", scb.taxa$Taxa, fixed = TRUE)
  scb.taxa$Taxa <- gsub(")", "_", scb.taxa$Taxa, fixed = TRUE)

  scb.taxa <- scb.taxa %>%
    tidyr::pivot_wider(id_cols = "StationID", names_from = "Taxa",
                       values_from = "Abundance", values_fn = list(Abundance = list))
  scb.taxa <- as.data.frame(scb.taxa)

  scb.taxa <- scb.taxa[, -1] # drop the StationID column; station identity is carried by the row names set below

  colnames(scb.taxa) <- gsub("Abundance.", "", colnames(scb.taxa), fixed = TRUE)

  # Cells with no observation come out of the pivot as NULL list entries;
  # replace them with zero and convert the list-columns to numeric.
  scb.taxa[scb.taxa == "NULL"] <- 0
  scb.taxa <- as.data.frame(lapply(scb.taxa, as.numeric))
  row.names(scb.taxa) <- row.names(scb.predictors)
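  # scb.taxa is now a site-by-taxon abundance matrix; station row names are
  # copied from the predictor matrix, which assumes both tables list stations
  # in the same order.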

  # RIVPACS calculations. By default the functions use the example user data.
  socal <- SoCalRivpacs(observed.predictors = scb.predictors, observed.taxa = scb.taxa)

  # The stations column of the O/E table was being returned as a factor; convert
  # factor columns to character.
  socal$oe.table <- socal$oe.table %>%
    dplyr::mutate_if(is.factor, as.character)

  benthic_data <- benthic_data %>%
    #dplyr::rename(B13_Stratum = Stratum) %>%
    dplyr::select(StationID, Replicate, SampleDate, Stratum) %>%
    dplyr::distinct()

  rivpacs.score <- socal$oe.table %>%
    dplyr::select(stations, O.over.E) %>%
    dplyr::rename(StationID = stations, Score = O.over.E) %>%
    dplyr::full_join(benthic_data, by = "StationID") %>%
    dplyr::mutate(Index = "RIVPACS") %>%
    # Condition categories per the CASQO Technical Manual: scores near 1 are
    # reference-like; increasing departure from 1 in either direction indicates
    # increasing disturbance.
    dplyr::mutate(Category = case_when(Score <= 0.32 ~ "High Disturbance",
                                       (Score > 0.32 & Score <= 0.74) | Score >= 1.26 ~ "Moderate Disturbance",
                                       (Score > 0.74 & Score <= 0.90) | (Score >= 1.10 & Score < 1.26) ~ "Low Disturbance",
                                       Score > 0.90 & Score < 1.10 ~ "Reference")) %>%
    dplyr::mutate(`Category Score` = case_when(Category == "Reference" ~ 1,
                                             Category == "Low Disturbance" ~ 2,
                                             Category == "Moderate Disturbance" ~ 3,
                                             Category == "High Disturbance" ~ 4)) %>%
    dplyr::select(StationID, SampleDate, Replicate, Stratum, Index, Score, Category, `Category Score`)


  return(rivpacs.score)
}



# ---- ORIGINAL RIVPACS FUNCTION ----
# This is the original RIVPACS function. Called by the above function.
SoCalRivpacs <- function(Pcutoff = 0.5,
                         reference.groups = socal.reference.groups,
                         observed.predictors = socal.example.habitat,
                         reference.taxa = socal.reference.taxa,
                         group.means = socal.reference.group.means,
                         reference.cov = socal.reference.covariance,
                         observed.taxa = socal.example.taxa) {

  # Pcutoff is the probability cutoff

  # Names of predictor variables.
  predictor.variables <- c("Latitude", "Longitude", "SampleDepth")

  # ----- Format Observed Data -----

  FormatObservedData <- function() {

    # Align observed (user) data columns with reference data columns. Columns in same
    # order. Observed data may have a different number of taxa (columns) than
    # reference data.

    # Convert observed.taxa to presence/absence (0/1)
    tmp.pa <- observed.taxa
    #tmp.pa <- lapply(tmp.pa, as.numeric)
    tmp.pa[tmp.pa > 0] <- 1

    # Reorder rows to match the predictor matrix so both tables are in the same
    # station order.
    tmp.pa <- tmp.pa[row.names(observed.predictors), ]


    # Container matrix.
    n.observed.sites <- dim(tmp.pa)[1]
    n.reference.taxa <- dim(reference.taxa)[2]

    observed.taxa.pa <- matrix(rep(0, times = n.observed.sites * n.reference.taxa),
                               nrow = n.observed.sites, ncol = n.reference.taxa,
                               dimnames = list(rownames(tmp.pa), names(reference.taxa)))

    # Fill container with observed data.
    col.match <- match(dimnames(observed.taxa.pa)[[2]], dimnames(tmp.pa)[[2]])
    #tmp.pa <- as.data.frame(lapply(tmp.pa, as.numeric))

    for(i in 1:n.reference.taxa) {

      if(!is.na(col.match[i])) observed.taxa.pa[, i] <- tmp.pa[, col.match[i]]

    }

    # The matrix observed.taxa.pa contains the observed.scores used for O/E.
    return(observed.taxa.pa)

  }

  observed.data <- FormatObservedData()

  # ----- Calculate Expected Data -----

  CalculateExpectedData <- function() {

    # Calculate probability of sites belonging to groups. Follow RIVPACS assumption
    # of weighting the group probabilities by reference group size. Flags outlier
    # sites, using the chi-squared statistic.

    # Definitions.
    n.predictor.variables <- length(predictor.variables)
    group.size <- table(reference.groups)
    n.groups <- length(group.size)

    # Chi-squared values for flagging outlier samples.
    degrees.freedom <- min(c(n.predictor.variables, (n.groups - 1)))
    crit.01 <- qchisq(0.99, df = degrees.freedom)
    crit.05 <- qchisq(0.95, df = degrees.freedom)
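    # A sample whose minimum distance to every reference group exceeds these
    # critical values is flagged as an outlier, i.e. outside the experience of
    # the reference model.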

    # Container for probabilities.
    n.observed.sites.filtered <- dim(observed.predictors)[[1]]

    group.probabilities <- matrix(rep(0, n.observed.sites.filtered * n.groups),
                                  nrow = n.observed.sites.filtered,
                                  dimnames = list(dimnames(observed.predictors)[[1]],
                                                  dimnames(group.means)[[1]]))

    # Container for outlier flags and minimum distance.
    outlier.flag <- data.frame(outlier.05 = rep(0, n.observed.sites.filtered),
                               outlier.01 = rep(0, n.observed.sites.filtered),
                               min.distance = rep(0, n.observed.sites.filtered),
                               row.names = dimnames(observed.predictors)[[1]])

    # calculate group membership probabilities for each sample and find outliers.
    for(i in 1:n.observed.sites.filtered) {

      # Squared Mahalanobis distance from current sample to each group mean.
      distances <- mahalanobis(group.means,
                               observed.predictors[i,],
                               reference.cov,
                               inverted = TRUE)

      group.probabilities[i,] <- group.size * exp(-0.5 * distances) # see Clarke et al. (2000)
      group.probabilities[i,] <- group.probabilities[i, ] / sum(group.probabilities[i, ])
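      # Membership probabilities are weighted by reference-group size and
      # rescaled to sum to 1 across groups for each sample.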

      # Outlier criteria is minimum distance.
      outlier.flag$min.distance[i] <- min(distances)

      # Check for outliers. Each sample is either a pass (0) or fail (1).
      if(outlier.flag$min.distance[i] > crit.05) outlier.flag[i, "outlier.05"] <- 1
      if(outlier.flag$min.distance[i] > crit.01) outlier.flag[i, "outlier.01"] <- 1

    }

    # Occurrence frequencies of all taxa in the reference groups.
    freq.in.group <- apply(reference.taxa, 2,
                           function(x){tapply(x, reference.groups, function(y){sum(y) / length(y)})})

    # Matrix algebra form of the RIVPACS combining formula (Clarke et al. 2003, Eq. 4).
    predicted.prob.all <- group.probabilities %*% freq.in.group
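    # (n.sites x n.groups) %*% (n.groups x n.taxa) yields an (n.sites x n.taxa)
    # matrix of expected capture probabilities for every taxon at every site.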

    # predicted.prob.all are the predicted (expected) probabilities.
    expected.data <- list(predicted = predicted.prob.all, outliers = outlier.flag, n = n.observed.sites.filtered)

    return(expected.data)

  }

  expected.data <- CalculateExpectedData()

  # ----- Calculate Scores -----

  CalculateScores <- function() {

    observed.score <- vector(mode = "numeric", length = expected.data$n)
    expected.score <- vector(mode = "numeric", length = expected.data$n)
    BC <- vector(mode = "numeric", length = expected.data$n) # Bray-Curtis dissimilarity



    for(i in 1:expected.data$n) {
      tryCatch(
        {
          predicted.prob <- expected.data$predicted[i, ] # predicted probabilities for current sample

          taxa.subset <- names(predicted.prob)[predicted.prob >= Pcutoff]  # subset of taxa with probabilities >= Pcutoff

          expected.prob <- predicted.prob[taxa.subset] # probabilities for the subset of included taxa

          observed.pa <- observed.data[i, taxa.subset] # observed presence/absence for those taxa

          observed.score[i] <- sum(observed.pa) # observed richness (O)
          expected.score[i] <- sum(expected.prob) # expected richness (E)
          BC[i] <- sum(abs(observed.pa - expected.prob)) /
            (observed.score[i] + expected.score[i]) # Bray-Curtis value
        },
        error = function(e) {
          # Use <<- so the NA values reach the vectors in the enclosing function;
          # a plain <- here would only modify copies local to the error handler.
          observed.score[i] <<- NA_real_ # observed richness (O)
          expected.score[i] <<- NA_real_ # expected richness (E)
          BC[i] <<- NA_real_ # BC value
        }
      )


    }

    O.over.E <- observed.score/expected.score
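    # O/E is the ratio of observed to expected taxon richness; values near 1
    # indicate reference-like condition (see the category thresholds applied in
    # RIVPACS() above).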

    stats <- data.frame(stations = row.names(observed.predictors),
                        O = observed.score,
                        E = round(expected.score, digits = 4),
                        O.over.E = round(O.over.E, digits = 4))

    #     stats <- data.frame(stations = row.names(observed.predictors),
    #                         O = observed.score,
    #                         E = expected.score,
    #                         O.over.E = O.over.E)

    stats$outlier.05 <- expected.data$outliers$outlier.05
    stats$outlier.01 <- expected.data$outliers$outlier.01

    # Convert to "PASS" or "FAIL"
    stats$outlier.05[stats$outlier.05 == 0] <- "PASS"
    stats$outlier.05[stats$outlier.05 == 1] <- "FAIL"

    stats$outlier.01[stats$outlier.01 == 0] <- "PASS"
    stats$outlier.01[stats$outlier.01 == 1] <- "FAIL"

    #   mean.O.over.E <- mean(OE.stats$O.over.E)
    #   stdev.O.over.E <- sqrt(var(OE.stats$O.over.E))


    return(stats)

  }

  results <- list(oe.table = CalculateScores(),
                  observed = observed.data,
                  predicted = expected.data$predicted,
                  Pcutoff = Pcutoff,
                  region = "scb")

  return(results)
}