R/misc.R

Defines functions plot_prediction as_patch lump_vars prep_dataset

Documented in as_patch lump_vars plot_prediction prep_dataset

#' Prep a dataset by removing variables, remove incomplete cases,
#'  aggregating specified columns, and applying threshold/labeling to the aggregated column
#'  
#' @export
#' @param x tibble of input data
#' @param drop_var character, the variables to deselect
#' @param drop_fun function, such as \code{\link[dplyr]{starts_with}}
#' @param complete_cases_only logical, if TRUE drop incomplete records (after deselection)
#' @param lump_var character character, vector of variables to sum into a new column
#' @param lump_fun function, such as \code{\link[dplyr]{starts_with}}
#' @param log_var character, variables to log scale (base 10), match with \code{\link[dplyr]{any_of}}
#' @param newname character the name of the new aggregated colum
#' @param threshold numeric, the threshold(s) used to define patches
#' @return a tibble cleaned and transformed for downstream processing
prep_dataset <- function(x = read_dataset(),
                         complete_cases_only = TRUE,
                         lump_var = "Calanus finmarchicus",
                         lump_fun = dplyr::starts_with,
                         drop_var = c("geometry",
                                      "longitude",
                                      "latitude",
                                      "station",
                                      "year",
                                      "siconc",
                                      "sithick",
                                      complement_species(lump_var)),
                         drop_fun = dplyr::starts_with,
                         log_var = c("bathymetry", "chlor_a"),
                         newname = "patch",
                         threshold = 10000){
  
  if (length(drop_var) > 0) x <- dplyr::select(x, -drop_fun(drop_var)) 
  
  if(complete_cases_only) x <- na.omit(x)

  # log scale as requested
  if (length(log_var) > 0) {
    x <- dplyr::mutate(x, dplyr::across(dplyr::any_of(log_var),  ~ log10(abs(.x) + 0.00001) ))
  }
  
  # aggregate the species
  x <- calanusthreshold::lump_vars(x, 
                                  vars = lump_var,
                                  selector = lump_fun,
                                  newname = newname)
  # now apply threshold
  x[[newname]] <- calanusthreshold::as_patch(x[[newname]], threshold) 
  x
}

#' Lump variables in a table together by summing
#' 
#' @export
#' @param x tibble
#' @param vars character, one or more variables to lump
#' @param selector function the function used to select the variables,
#'   see \code{tidyselect} helpers.
#' @param newname character, the name of the new (lumped) columns
#' @param na.rm logical, if TRUE lump after removing NA values
#' @param drop logical, if TRUE drop the \code{vars}
#' @return tibble
lump_vars <- function(x = read_dataset(),
                      vars = c("Calanus finmarchicus IV", 
                               "Calanus finmarchicus V", 
                               "Calanus finmarchicus VI", 
                               "Calanus hyperboreus IV", 
                               "Calanus hyperboreus V", 
                               "Calanus hyperboreus VI", 
                               "Calanus glacialis IV", 
                               "Calanus glacialis V", 
                               "Calanus glacialis VI"),
                      selector = dplyr::all_of,
                      newname = "lumped",
                      na.rm = TRUE, 
                      drop = TRUE){

  if (inherits(x, "sf")){
    lumped <- x |>
      sf::st_drop_geometry() |>
      dplyr::select(selector(vars)) |>
      rowSums(na.rm = na.rm)
  } else {
    lumped <- x |>
      dplyr::select(selector(vars)) |>
      rowSums(na.rm = na.rm)
  }
  
  x <- x |>
    dplyr::mutate(!!newname := lumped)
  
  if (drop){
    x <- x |> 
    dplyr::select(-selector(vars))
  }
  x
}


#' Convert abundance to patch density
#' 
#' @export
#' @param x a vector of abundances
#' @param threshold numeric boundaries defining levels of patchiness from low to high.
#'   Values in \code{x} below the lowest threshold are considered "no-patch"
#' @param form character, one of 'index' of 'factor' to control the output type
#'   in either case NAs are propagated.
#' @return numeric indices into threshold, possibly as factors.  Zero means no patch.
as_patch <- function(x, 
                     threshold = list(10000, c(2500, 5000, 7500, 10000))[[1]],
                     form = c("index", "factor")[2]){

  ix <- findInterval(x, threshold)
  if (tolower(form[1]) == "factor") {
    ix <- factor(ix, levels = seq(from = 0L, by = 1L, length = length(threshold)+1))
  }
  ix
}

#' Plot prediction rasters (patch and probability)
#' 
#' @export
#' @param x \code{stars} object with two attributes
#' @param zlim 2 element vector of limits for probability, or 
#'   "fit" to scale to the range of prob values 
#' @return 2 element list of ggplot2 objects ("patch", "prob")
plot_prediction <- function(x, zlim = list("fit", c(0,1))[[1]]){
  
  patch <- ggplot2::ggplot() +
    stars::geom_stars(data = x['patch'] |>
                        dplyr::mutate(patch = factor(.data$patch, levels = c(1, 0, NA)))) + 
    ggplot2::scale_fill_manual(values=c("1"="#D55E00", "0"="#559E7380", "NA"="#99999980")) +
    ggplot2::coord_equal() +
    ggplot2::labs(x = "Lon", y = "Lat")
  
  if (inherits(zlim, 'character')){
    zlim <- c(0, max(x$prob))
  }
    
  prob <- ggplot2::ggplot() + 
    stars::geom_stars(data = x['prob']) + 
    ggplot2::scale_fill_continuous(low="white",high="orange", limits=zlim) +
    ggplot2::coord_equal() +
    ggplot2::labs(x = "Lon", y = "Lat")
  
  list(patch = patch, prob = prob)
}
BigelowLab/calanusthreshold documentation built on May 12, 2022, 5:06 a.m.