R/create_data.R

Defines functions create_data

Documented in create_data

#' Create data file for fitting time varying run timing distributions with TMB
#'
#' Does minimal processing of data to use as argument to fitting function
#'
#' @param data A data frame
#' @param min_number A minimum threshold to use, defaults to 0
#' @param variable A character string of the name of the variable in 'data' that contains the response (e.g. counts)
#' @param time A character string of the name of the variable in 'data' that contains the time variable (e.g. year)
#' @param date A character string of the name of the variable in 'data' that contains the response (e.g. day of year). The actual
#' #' column should contain a numeric response -- for example, the result from using lubridate::yday(x)
#' @param mu An optional formula allowing the mean to be a function of covariates. Random effects are not included in the formula
#' but specified with the `est_mu_re` argument
#' @param sigma An optional formula allowing the standard deviation to be a function of covariates. For asymmetric models,
#' each side of the distribution is allowed a different set of covariates. Random effects are not included in the formula
#' but specified with the `est_sigma_re` argument
#' @param covar_data a data frame containing covariates specific to each time step. These are used in the formulas `mu` and `sigma`
#' @param asymmetric_model Boolean, whether or not to let model be asymmetric (e.g. run timing before peak has a
#' different shape than run timing after peak)
#' @param est_sigma_re Whether to estimate random effects by year in sigma parameter controlling tail of distribution. Defaults to TRUE
#' @param est_mu_re Whether to estimate random effects by year in mu parameter controlling location of distribution. Defaults to TRUE
#' @param tail_model Whether to fit Gaussian ("gaussian"), Student-t ("student_t") or generalized normal ("gnorm"). Defaults to Student-t
#' @param family Response for observation model, options are "gaussian", "poisson", "negbin", "binomial", "lognormal". The default ("lognormal") is
#' not a true lognormal distribution, but a normal-log in that it assumes log(y) ~ Normal()
#' @param max_theta Maximum value of log(pred) when `limits=TRUE`. Defaults to 10
#' @param share_shape Boolean argument for whether asymmetric student-t and generalized normal distributions should share the shape parameter (nu for the student-t;
#' beta for the generalized normal). Defaults to TRUE
#' @param nu_prior Two element vector (optional) for penalized prior on student t df, defaults to a Gamma(shape=2, scale=10) distribution
#' @param beta_prior Two element vector (optional) for penalized prior on generalized normal beta, defaults to a Normal(2, 1) distribution
#' @export
#' @importFrom stats model.matrix
#' @examples
#' data(fishdist)
#' datalist <- create_data(fishdist,
#'   min_number = 0, variable = "number", time = "year",
#'   date = "doy", asymmetric_model = TRUE, family = "gaussian"
#' )
create_data <- function(data,
                        min_number = 0,
                        variable = "number",
                        time = "year",
                        date = "doy",
                        asymmetric_model = TRUE,
                        mu = ~1,
                        sigma = ~1,
                        covar_data = NULL,
                        est_sigma_re = TRUE,
                        est_mu_re = TRUE,
                        tail_model = "student_t",
                        family = "lognormal",
                        max_theta = 10,
                        share_shape = TRUE,
                        nu_prior = c(2,10),
                        beta_prior = c(2,1)) {

  dist <- c("gaussian", "poisson", "negbin", "binomial", "lognormal")
  fam <- match(family, dist)
  if (is.na(fam)) {
    stop("Make sure the entered family is in the list of accepted distributions")
  }

  tail <- c("gaussian", "student_t", "gnorm")
  tailmod <- match(tail_model, tail)
  if (is.na(tailmod)) {
    stop("Make sure the entered tail model is in the list of accepted distributions")
  }

  # check to make sure year and date are numeric
  if (!is.numeric(data[, time])) {
    stop("The time variable in the data frame (e.g. year) needs to be numeric")
  }
  if (is.numeric(data[, date])) {
    if (max(data[, date], na.rm = T) > 365) stop("The date variable in the data frame contains values greater than 365")
    if (min(data[, date], na.rm = T) < 1) stop("The date variable in the data frame contains values less than 1")
  } else {
    stop("The date variable in the data frame (e.g. day_of_year) needs to be numeric")
  }

  # optional priors
  use_t_prior = TRUE
  if (length(nu_prior) != 2) {
    if(is.na(nu_prior)) {
      use_t_prior = FALSE
    } else {
      stop("The nu prior must be a numeric 2-element vector or NA")
    }
  }
  use_beta_prior = TRUE
  if (length(beta_prior) != 2) {
    if(is.na(beta_prior)) {
      use_beta_prior = FALSE
    } else {
      stop("The beta prior must be a numeric 2-element vector or NA")
    }
  }

  # if 1 level, turn off trend and random effect estimation
  if (length(unique(as.numeric(data[, time]))) == 1) {
    est_sigma_re <- FALSE
    est_mu_re <- FALSE
  }

  # drop rows below threshold or NAs
  drop_rows <- which(is.na(data[, variable]) | data[, variable] < min_number)
  if (length(drop_rows) > 0) data <- data[-drop_rows, ]

  # rescale year variable to start at 1 for indexing
  data$year <- data[, time] - min(data[, time]) + 1

  # parse formulas. covar_data contains covariates specific to each time step
  if (is.null(covar_data)) {
    covar_data <- data.frame(year = unique(data$year))
  }
  mu_mat <- model.matrix(mu, data = covar_data)
  sig_mat <- model.matrix(sigma, data = covar_data)

  data_list <- list(
    y = data[, variable],
    yint = round(data[, variable]),
    years = as.numeric(as.factor(data$year)),
    x = data[, date],
    year_levels = as.numeric(as.factor(unique(data$year))),
    unique_years = unique(data$year),
    nLevels = length(unique(data$year)),
    asymmetric = as.numeric(asymmetric_model),
    family = fam,
    mu_mat = mu_mat,
    sig_mat = sig_mat,
    tail_model = as.numeric(tailmod) - 1,
    est_sigma_re = as.numeric(est_sigma_re),
    est_mu_re = as.numeric(est_mu_re),
    max_theta = max_theta,
    share_shape = as.numeric(share_shape),
    use_t_prior = as.numeric(use_t_prior),
    use_beta_prior = as.numeric(use_beta_prior),
    beta_prior = beta_prior,
    nu_prior = nu_prior
  )

  return(data_list)
}
ericward-noaa/phenomix documentation built on May 6, 2024, 10:20 a.m.