R/utils.R

Defines functions assign_na_age assign_alk_attributes rename_num_col rename_age_col rename_size_col sanitize_laa_data rename_laa_cols check_laa_data bin_lengths min_age_groups min_count_laa_data adjust_plus_min_ages adjust_plus_min_ages_df check_model_type ages_as_integer ages_as_ordered_factor add_spp_level rm_spp_level spp_level is_spp_in_data is_spp_in_levels check_length_data check_age_data

Documented in add_spp_level adjust_plus_min_ages adjust_plus_min_ages_df ages_as_integer ages_as_ordered_factor assign_alk_attributes assign_na_age bin_lengths check_age_data check_length_data check_model_type is_spp_in_data is_spp_in_levels min_age_groups min_count_laa_data rename_laa_cols rm_spp_level spp_level

#' Check for age/length data in the data being estimated or predicted
#'
#' These are just simple helper functions used within other functions that check
#' to make sure that ages and lengths are present in the data and stop the
#' fucntion call if they are missing
#'
#' @param data A data.frame
#' @param age_col Character. The column name for the age column in \code{data}
#'
#' @return NULL. An error will be called if age/length data is missing
#'
#' @name check_agelen_data
check_age_data <- function(data, age_col) {
  if (!(age_col %in% colnames(data))) {
    stop(
      "You are missing an age column in your data. ",
      "It's hard to create an age estimator without some known ages.\n",
      "Perhaps try setting the age_col argument."
    )
  }
}

#' @rdname check_agelen_data
#' @param size_col Character. The column name for the size column in \code{data}
check_length_data <- function(data, size_col) {
  if (!(size_col %in% colnames(data))) {
    stop(
      "You are missing a size column in your data. ",
      "I won't know how to age anything without sizes.\n",
      "Perhaps try setting the size_col argument"
    )
  }
}

#' Check for species in columns and/or levels and add to levels if present
#'
#' These helper functions just check to see if a species column exists in the
#' data (designated as 'spp' or 'species'). If one of those columns exists,
#' but the column name is not in the levels argument it will get added to
#' levels.
#'
#' @param data A data.frame with length-at-age data
#' @param levels The levels argument passed from \code{\link{make_halk}}
#'
#' @name spp_levels
#'
#' @return A character vector of levels possibly with 'spp' or 'species' added
is_spp_in_levels <- function(levels) {
  return(any(grepl("^spp$|^species$", levels, ignore.case = TRUE)))
}

#' @rdname spp_levels
is_spp_in_data <- function(data) {
  return(any(grepl("^spp$|^species$", names(data), ignore.case = TRUE)))
}

#' @rdname spp_levels
spp_level <- function(levels) {
  if (is_spp_in_levels(levels)) {
    return(levels[grep("^spp$|^species$", levels, ignore.case = TRUE)])
  } else {
    return(NULL)
  }

}

#' @rdname spp_levels
rm_spp_level <- function(levels) {
  if (is_spp_in_levels(levels)) {
    out <- levels[-grep("^spp$|^species$", levels, ignore.case = TRUE)]
    if (length(out) == 0) {
      return(NULL)
    } else {
      return(out)
    }
  } else {
    return(levels)
  }
}

#' @rdname spp_levels
add_spp_level <- function(data, levels) {
  spp_col <- names(data)[grepl("^spp$|^species$", names(data), ignore.case = TRUE)]
  if (length(spp_col) == 1) {
    unique_spp <- unique(data[[spp_col]])
    if (length(unique_spp) == 1) {
      return(levels)
    } else if (!(spp_col %in% levels)) {
      levels <- c(spp_col, levels)
      warning(
        "You presumably have a species column designated as ", spp_col,
        ". I'm adding it to levels.\n",
        "You can add it to levels manually to suppress this warning."
      )
    }
  } else if (length(spp_col) > 1) {
    unique_spp <- unique(data[[spp_col[1]]])
    if (length(unique_spp) == 1) {
      return(levels)
    } else if (!(spp_col %in% levels)) {
      levels <- c(spp_col[1], levels)
      warning(
        "You have multiple columns designated for species.\n",
        paste(paste0("  *", spp_col), collapse = "\n"),
        ".\nI'm adding the first to levels. ",
        "Manually add one or more of these to levels to suppress this warning."
      )
    }
  }
  return(levels)
}

#' Convert ages from/to ordered factor
#'
#' In order for the machine learning models to properly predict ages, the
#' known ages should be converted to an ordered factor during model fitting.
#' This will ensure that the predict.* functions return age values that actually
#' make sense.
#'
#' @param data A data.frame with a column corresopnding to \code{age_col} or
#' a vector of values
#' @param age_col Character. The name of the column that contains ages
#'
#' @return A data.frame with the values in \code{age_col} converted to an
#' ordered factor
#'
#' @name ages_as_ordered
ages_as_ordered_factor <- function(data, age_col = "age") {
  if (is.vector(data)) {
    out <- as.ordered(data)
  } else {
    out <-
      data %>%
      dplyr::mutate_at(dplyr::vars(!!rlang::sym(age_col)), as.ordered)
  }
  return(out)
}

#' @rdname ages_as_ordered
ages_as_integer <- function(data, age_col = "est.age") {
  if (is.ordered(data) || is.factor(data)) {
    out <- as.integer(as.character(data))
  } else {
    out <-
      data %>%
      dplyr::mutate_at(dplyr::vars(!!rlang::sym(age_col)), function(x) {
        as.integer(as.character(x))
      })
  }
  return(out)
}

#' Check the model type and return standardized version
#'
#' This is a non-exported function to check whether the model type specified is
#' available and return a standardized version of the model name. This
#' standardized version will then feed into a S3 method for the given model.
#'
#' @param model A character string naming the model
#'
#' @return A standardized version of the model name, or an error if
#' \code{model} doesn't exist yet
check_model_type <- function(model) {
  model_standard <- gsub("\\s+|-", "_", trimws(model))
  if (!(model_standard %in% model_types$model)) {
    stop("Can't find a model type for ", model)
  } else {
    model_name <- model_types$model_name[model_types$model == model_standard]
    return(model_name)
  }
}

#' Adjusts data to account for plus group or minimum age
#'
#' These functions performs two tasks. It lumps all ages greater than the
#' plus group into that age, and it filters data only to those greater than
#' or equal to the minimum age. \code{adjust_plus_min_ages} works on a vector
#' whereas \code{adjust_plus_min_ages_df} words on a data.frame
#'
#' @param data Data with age as a column, or a numeric vector of ages
#' @param minage Numeric. The minimum age; everything else is excluded
#' @param pls_grp Numeric. The plus group; all ages older will be lumped into
#' this group
#'
#' @name adjust_ages
#'
#' @return A data.frame similar to \code{data}, but with ages less than
#' \code{minage} excluded and ages >= \code{plus_group} aggregated into that age
adjust_plus_min_ages_df <- function(data, minage = NULL, pls_grp = NULL) {
  if (!is.null(minage)) {
    data <-
      data %>%
      dplyr::filter(.data$age >= minage)
  }
  if (!is.null(pls_grp)) {
    data <-
      data %>%
      dplyr::mutate(age = dplyr::case_when(
        .data$age > pls_grp ~ as.integer(pls_grp),
        TRUE ~ as.integer(.data$age)
      ))
  }
  return(data)
}

#' @rdname adjust_ages
#' @param age_vec A vector of ages
adjust_plus_min_ages <- function(age_vec, minage = NULL, pls_grp = NULL) {
  if (!is.null(minage)) {
    age_vec[age_vec < minage] <- minage
  }
  if (!is.null(pls_grp)) {
    age_vec[age_vec > pls_grp] <- pls_grp
  }
  return(age_vec)
}


#' Count number of length-at-age samples or age groups at each level and return
#' those with greater than equal to the minimum desired number
#'
#' These are helper shortcut functions to determine if data meet the minimum
#' desired number of age groups and/or sample sizes.
#'
#' @param data Data.frame with length-at-age data
#' @param sub_levels The levels at which to check
#' @inheritParams make_alk
#'
#' @return A data.frame just like \code{data}, but with samples excluded that
#' don't meet the required number of samples in \code{min_sample_size}
#'
#' @name min_samples
min_count_laa_data <- function(data, sub_levels = NULL,
                               min_age_sample_size = NULL,
                               min_total_sample_size = NULL,
                               min_age_groups = NULL) {
  if (is.null(data)) {
    return(NULL)
  }
  if (!is.null(min_total_sample_size)) {
    if (is.null(sub_levels)) {
      if (nrow(data) >= min_total_sample_size) {
        data <- data
      } else {
        return(NULL)
      }
    } else {
      sub_grouping <-
        sub_levels %>%
        rlang::syms()
      laa_counts <-
        data %>%
        dplyr::count(!!!sub_grouping) %>%
        dplyr::ungroup() %>%
        dplyr::group_by(!!!rlang::syms(sub_levels)) %>%
        dplyr::filter(.data$n >= min_total_sample_size) %>%
        dplyr::ungroup()
      data <-
        data %>%
        dplyr::inner_join(laa_counts, by = sub_levels) %>%
        dplyr::select(-"n")
    }
  }
  if (!is.null(min_age_sample_size)) {
    sub_grouping <-
      sub_levels %>%
      c("age") %>%
      unlist() %>% rlang::syms()
    laa_counts <-
      data %>%
      dplyr::count(!!!sub_grouping) %>%
      dplyr::ungroup() %>%
      dplyr::group_by(!!!rlang::syms(sub_levels)) %>%
      # changed this to include any data that meets minimum standards
      # even if certain age groups don't
      # (i.e. older age groups with single individual) - PNF 11/22/2023
      dplyr::mutate(age_grps_gt_min = sum(.data$n >= min_age_sample_size)) %>%
      dplyr::filter(.data$age_grps_gt_min >= min_age_groups) %>%
      # dplyr::filter(dplyr::if_any(.data$n, ~.x >= min_age_sample_size)) %>%
      dplyr::ungroup()
    data <-
      data %>%
      dplyr::inner_join(laa_counts, by = c(sub_levels, "age")) %>%
      dplyr::select(-"n")
  }
  if (nrow(data) == 0) {
    return(NULL)
  } else {
    return(data)
  }
}

#' @rdname min_samples
#' @param min_age_grps The minimum number of age groups that must be present
#' in data to create an ALK
min_age_groups <- function(data, sub_levels = NULL, min_age_grps) {
  if (is.null(data)) {
    return(NULL)
  }
  if (is.null(sub_levels)) {
    if (length(unique(data$age)) >= min_age_grps) {
      return(data)
    } else {
      return(NULL)
    }
  } else {
    sub_grouping <- rlang::syms(sub_levels)
    age_grp_counts <-
      data %>%
      dplyr::group_by(!!!sub_grouping) %>%
      dplyr::summarize(n_age_grps = length(unique(.data$age)), .groups = "drop") %>%
      dplyr::filter(.data$n_age_grps >= min_age_grps)
    temp_age_data <-
      data %>%
      dplyr::inner_join(age_grp_counts, by = sub_levels) %>%
      dplyr::select(-"n_age_grps")
    if (nrow(temp_age_data) == 0) {
      return(NULL)
    } else {
      return(temp_age_data)
    }
  }
}

#' Convert a vector of lengths into binned values
#'
#' This will take a vector of numeric values and bin them according to the value
#' specified in binwidth
#'
#' @param x Numeric vector of values
#' @param binwidth Numeric vector specifying how wide the length bins should be
#' @param include_upper Logical. Append the upper value of the bin and return
#' the length range as a character string (TRUE), or return the lower value as
#' numeric (FALSE, default)
#' @param ... Additional arguments passed onto \code{\link[base]{cut}}
#'
#' @return A vector of values the same length as x, but binned to the values
#' according to binwidth
#' @export
#'
#' @examples
#' bin_lengths(length_data$length, binwidth = 2)
bin_lengths <- function(x, binwidth, include_upper = FALSE, ...) {
  if (length(x) == 0) {
    stop("You have no length data to bin")
  }
  bins <- seq(0, ceiling(max(x, na.rm = TRUE) + binwidth), by = binwidth)
  if (include_upper) {
    cut_bins <- cut(x, bins, right = FALSE, ...)
    bin_levels <- levels(cut_bins)
    brackets_regex <- "^\\[|^\\(|\\]$|\\)$"
    bin_levels <- gsub(brackets_regex, "", bin_levels)
    bin_levels <- gsub(",", "-", bin_levels)
    tmp <- gsub(brackets_regex, "", as.character(cut(x, bins)))
    out <- gsub(",", "-", tmp)
    out <- ordered(out, levels = bin_levels)
  } else {
    tmp <- gsub(",.*$", "", as.character(cut(x, bins, right = FALSE, ...)))
    out <- as.numeric(gsub("^\\[", "", tmp))
  }
  return(out)
}

check_laa_data <- function(df, quiet = FALSE) {
  if (is.null(df) || nrow(df) == 0) {
    if (!quiet) {
      warning(
        "You have no data to make an alk with. ",
        "Consider change plus group or minimum age."
      )
    }
    return(NULL)
  } else {
    return(df)
  }
}

#' Simple helper function to rename size and age column names to age and length
#'
#' @param data Any data.frame with some columns representing age and size
#' @param size_col Character. The name of the column containing sizes
#' @param age_col Character. The name of the column containing ages
#' @param num_col Character. The name of the column containing number of
#' individuals
#' @param goback Logical. Reverse names once they've already been renamed
#'
#' @return A data.frame the same as \code{data}, but with names changed
rename_laa_cols <- function(data,
                            size_col = "length",
                            age_col = "age",
                            num_col = NULL,
                            goback = FALSE) {
  out <-
    data %>%
    rename_size_col(sc = size_col, back = goback) %>%
    rename_age_col(ac = age_col, back = goback)
  if (!is.null(num_col)) {
    out <- rename_num_col(out, num_col, back = goback)
  }
  return(out)
}

# simple helper function to remove NA values from length-at-age data
sanitize_laa_data <- function(data) {
  out <-
    data %>%
    dplyr::filter(!is.na(.data$length), !is.na(.data$age))
  return(out)
}



# keeping this in case I screwed anything up parsing this fucntion out
# PNF - 04/22/2022
# rename_laa_cols_bkp <- function(data,
#                             size_col = "length",
#                             age_col = "age",
#                             num_col = NULL,
#                             goback = FALSE) {
#   if (goback) {
#     size_col_ind <- grep("^length$", names(data))
#     age_col_ind <- grep("^age", names(data))
#     names(data)[size_col_ind] <- size_col
#     names(data)[age_col_ind] <- gsub("age", age_col, names(data)[age_col_ind])
#     if (!is.null(num_col)) {
#       num_col_ind <- grep("^n$", names(data))
#       names(data)[num_col_ind] <- num_col
#     }
#   } else {
#     size_col_ind <- grep(paste0("^", size_col, "$"), names(data))
#     age_col_ind <- grep(paste0("^", age_col, "$"), names(data))
#     names(data)[size_col_ind] <- "length"
#     names(data)[age_col_ind] <- gsub(age_col, "age", names(data)[age_col_ind])
#     if (!is.null(num_col)) {
#       num_col_ind <- grep(num_col, names(data))
#       names(data)[num_col_ind] <- "n"
#     }
#   }
#   return(data)
# }

rename_size_col <- function(data, sc = "length", back = FALSE) {
  if (back) {
    size_col_ind <- grep("^length$", names(data))
    names(data)[size_col_ind] <- sc
  } else {
    size_col_ind <- grep(paste0("^", sc, "$"), names(data))
    names(data)[size_col_ind] <- "length"
  }
  return(data)
}


rename_age_col <- function(data, ac = "age", back = FALSE) {
  if (back) {
    age_col_ind <- grep("^age", names(data))
    names(data)[age_col_ind] <- gsub("age", ac, names(data)[age_col_ind])
  } else {
    age_col_ind <- grep(paste0("^", ac, "$"), names(data))
    names(data)[age_col_ind] <- gsub(ac, "age", names(data)[age_col_ind])
  }
  return(data)
}

rename_num_col <- function(data, nc = "n", back = FALSE) {
  if (back) {
    num_col_ind <- grep("^n$", names(data))
    names(data)[num_col_ind] <- nc
  } else {
    num_col_ind <- grep(nc, names(data))
    names(data)[num_col_ind] <- "n"
  }
  return(data)
}



#' Assign associated age-length key attributes to a data.frame
#'
#' This is just a helper function to assign the needed attributes and classes
#' to a data.frame that is produced by either \code{\link{make_alk}} or
#' \code{\link{make_halk}}.
#'
#' @param data A data.frame
#' @param size_col Character. Name of the column representing sizes
#' @param age_col Character. Name of the column representing ages
#' @param autobin Logical to set the attribute of autobin
#' @param size_bin Numeric. What is the width of size bins
#' @param min_age Numeric. The minimum age that was included in the alk
#' @param plus_group Numeric. The age that represents the plus group
#' @param alk_n Numeric. The number of samples that went into creating the alk
#' @param classes Character. The class that should get prepended to the
#' data.frame class(es)
#' @param dnorm_params The value of parameters that went into creating the
#' normal distributions on the age groups
#' @param levels Character vector of the levels used. This creates the "levels"
#' attribute if present
#'
#' @return A data.frame with associated attributes assigned
assign_alk_attributes <- function(data,
                                  size_col = "length",
                                  age_col = "age",
                                  autobin = TRUE,
                                  size_bin = 1,
                                  min_age = NULL,
                                  plus_group = NULL,
                                  alk_n = NULL,
                                  classes = "alk",
                                  dnorm_params = NULL,
                                  levels = NULL) {
  attr(data, "size_col") <- size_col
  attr(data, "age_col") <- age_col
  attr(data, "autobin") <- autobin
  attr(data, "size_bin") <- size_bin
  attr(data, "min_age") <- min_age
  attr(data, "plus_group") <- plus_group
  attr(data, "alk_n") <- alk_n
  if (!is.null(levels)) {
    attr(data, "levels") <- levels
  }
  if (!is.null(dnorm_params)) {
    stopifnot(autobin & size_bin <= 1)
    attr(data, "dnorm_params") <- dnorm_params
  }
  class(data)  <- c(classes, class(data))
  return(data)
}

#' Simple function that returns NA values
#'
#' A vector of NA will be returned that is the length of \code{x}
#'
#' @param x Any vector of any length
#'
#' @return A vector the same length as x containing only NA values
assign_na_age <- function(x) {
  return(NA)
}

Try the halk package in your browser

Any scripts or data that you put into this service are public.

halk documentation built on May 29, 2024, 5:22 a.m.