# R/mm_model_by_ply.R
# Defines and documents the function mm_model_by_ply.

#' Split and label data into >=24-hr days for fitting daily metabolism
#'
#' Splits up to two data.frames, data and data_daily, into date-specific chunks.
#' These are passed to model_fun. If \code{day_tests} is not empty, those
#' validity checks are run and the results are also passed to model_fun (in
#' \code{ply_validity}). The results of model_fun (which must be a data.frame)
#' are modified to include the date as a first column, then row-bound together
#' into a single data.frame containing results from all days.
#'
#' @param model_fun the function to apply to each data ply. This function should
#'   accept the arguments \code{c(data_ply, data_daily_ply, day_start, day_end,
#'   ply_date, ply_validity, timestep_days, ...)} where \code{data_daily_ply} is
#'   \code{NULL} when the \code{data_daily} argument to \code{mm_model_by_ply}
#'   is missing or \code{NULL}
#' @param data required. A data.frame to split into chunks by date, where a
#'   'date' begins on the hour day_start and ends at the hour day_end. The
#'   solar.time column must be present.
#' @param data_daily optional. A data.frame containing inputs with a daily
#'   timestep, each row of which will be passed to the corresponding date chunk
#'   from \code{data}. The date column must be present.
#' @param day_start start time (inclusive) of a day's data in number of hours
#'   from the midnight that begins the date. For example, day_start=-1.5
#'   indicates that data describing 2006-06-26 begin at 2006-06-25 22:30, or at
#'   the first observation time that occurs after that time if day_start doesn't
#'   fall exactly on an observation time. For metabolism models working with
#'   single days of input data, it is conventional/useful to begin the day the
#'   evening before, e.g., -1.5, and to end just before the next sunrise, e.g.,
#'   30. For multiple consecutive days, it may make the most sense to start just
#'   before sunrise (e.g., 4) and to end 24 hours later. For nighttime
#'   regression, the date assigned to a chunk of data should be the date whose
#'   evening contains the data. The default is therefore 12 to 36 for
#'   metab_night, of which the times of darkness will be used.
#' @param day_end end time (exclusive) of a day's data in number of hours from
#'   the midnight that begins the date. For example, day_end=30 indicates that
#'   data describing 2006-06-26 end at the last observation time that occurs
#'   before 2006-06-27 06:00. See day_start for recommended start and end times.
#' @param day_tests list of tests to conduct to determine whether each date
#'   worth of data is valid for modeling. The results of these tests will be
#'   combined with the result of the test implied if \code{required_timestep} is
#'   numeric and then will be passed to \code{model_fun} as the
#'   \code{ply_validity} argument to that function.
#' @param required_timestep NA or numeric (length 1). If numeric, the timestep
#'   length in days that a date must have to pass the validity check (to within
#'   a tolerance of 0.2\% of the value of \code{required_timestep}). The result
#'   of this test will be combined with the results of the tests listed in
#'   \code{day_tests} and reported to \code{model_fun} as the
#'   \code{ply_validity} argument to that function.
#' @param timestep_days TRUE if you would like the mean timestep length to be
#'   calculated for each data ply and passed to \code{model_fun} as the
#'   \code{timestep_days} argument to that function. Alternatively, this may be
#'   numeric as a specifically expected timestep length in days; for example, a
#'   1-hour timestep is 1/24 = 0.0416667 days.
#' @param ... other args to be passed through mm_model_by_ply to model_fun
#' @return a data.frame of model results
#' @import dplyr
#' @import tibble
#' @examples
#' dat <- data_metab('10')
#' mm_model_by_ply(mm_model_by_ply_prototype, data=dat, day_start=2, day_end=28)$date
#' mm_model_by_ply(function(...) { data.frame(text='hi')},
#'   data=dat, day_start=2, day_end=28)
#' @export
mm_model_by_ply <- function(
  model_fun, data, data_daily=NULL, day_start, day_end,
  day_tests=c('full_day', 'even_timesteps', 'complete_data', 'pos_discharge', 'pos_depth'), required_timestep=NA,
  timestep_days=TRUE, ...
) {

  # Fail fast on missing or out-of-range day bounds. The (-24,24) and (0,48)
  # windows and the 48-hr span limit are required by the odd/even date-group
  # labeling algorithm below, which assumes a row can belong to at most two
  # (adjacent) dates
  if(missing(day_start) || is.null(day_start)) stop('day_start must be specified')
  if(missing(day_end) || is.null(day_end)) stop('day_end must be specified')
  if(day_end < day_start) stop("day_end must be greater than or equal to day_start")
  if(day_end - day_start > 48) stop("day_end - day_start must not be > 48") # would break our odd/even algorithm
  if(-24 >= day_start || day_start >= 24) stop("day_start must be in (-24,24)")
  if(0 >= day_end || day_end >= 48) stop("day_end must be in (0,48)")

  # Identify the data plys that will let us use a user-specified-hr window for
  # each date (day_start to day_end, which may be != 24). store this labeling in
  # two additional columns (odd.- and even.- date.groups).
  # NOTE(review): v() is a package-internal helper; it appears to unpack values
  # (e.g., strip unit metadata) from the data -- confirm against the package's
  # unit-handling code
  data.plys <- as.data.frame(v(data))
  if(!('solar.time' %in% names(data.plys))) stop("data must contain a 'solar.time' column")
  if(any(is.na(data.plys$solar.time))) stop("no values in solar.time may be NA")
  # require strictly increasing timestamps; if not, report the offending pair
  # of adjacent rows in the error message
  min_timestep <- mm_get_timestep(data$solar.time, format='unique')[1]
  if(length(min_timestep) == 1 && !is.na(min_timestep)[1] && min_timestep <= 0) {
    timesteps <- as.numeric(diff(v(data$solar.time)), units="days")
    timegoof <- which.min(timesteps) + c(0,1) # row indices straddling the smallest (non-positive) step
    stop("min timestep is <= 0: ", format(min_timestep, digits=3), " days from ",
         v(data$solar.time)[timegoof[1]], " (row ", timegoof[1], ") to ",
         v(data$solar.time)[timegoof[2]], " (row ", timegoof[2], ")")
  }
  if(!is.null(data_daily)) {
    # likewise require a 'date' column and strictly increasing dates in data_daily
    if(!('date' %in% names(data_daily))) stop("data_daily must contain a 'date' column")
    min_datestep <- mm_get_timestep(data_daily$date, format='unique')
    if(length(min_datestep) > 0 && !is.na(min_datestep)[1] && min_datestep[1] <= 0) {
      timesteps <- as.numeric(diff(v(data_daily$date)), units="days")
      timegoof <- which.min(timesteps) + c(0,1)
      stop("min datestep is <= 0: ", min_datestep, " days from ",
           v(data_daily$date)[timegoof[1]], " (row ", timegoof[1], ") to ",
           v(data_daily$date)[timegoof[2]], " (row ", timegoof[2], ")")
    }
  }
  # label each observation with its calendar date (as character, for grouping)
  # and its fractional hour of the day, derived from day-of-year + hour
  doyhr <- convert_date_to_doyhr(data.plys$solar.time)
  data.plys$date <- as.character(as.Date(lubridate::floor_date(data.plys$solar.time, 'year')) + as.difftime(floor(doyhr)-1, units='days'))
  data.plys$hour <- 24*(doyhr %% 1)

  # subtract a tiny fudge factor (one second) from the start and end bounds so
  # we don't exclude times that are essentially equal to day_start, and so we
  # do exclude times that are essentially equal to day_end
  day_start_fudge <- day_start - 1/(60*60*24)
  day_end_fudge <- day_end - 1/(60*60*24)

  # This function speeds things up in both of the next two loops. 'runs' refers
  # to sequential runs of values in date.group that have the same value.
  # Returns a tibble with one row per run: date, start row index, end row index
  get_runs <- function(date.group) {
    # dummy assignments to satisfy R CMD check about dplyr's non-standard evaluation
    values <- start <- end <- '.dplyr.var'
    replace(date.group, is.na(date.group), '') %>% # rle() can't match NA to NA, so recode as ''
      rle() %>%
      unclass %>%
      as_tibble %>%
      mutate(end=cumsum(lengths), start=c(1, 1+end[-n()])) %>%
      filter(values != '') %>% # drop the runs that were NA placeholders
      select(date=values, start, end)
  }
  # Determine which rows go with which date, and label dates as either odd or
  # even (no relation to the actual value of the date)
  date_rows <- get_runs(data.plys$date)
  odd.dates <- date_rows[which(seq_len(nrow(date_rows)) %% 2 == 1),]$date
  even.dates <- date_rows[which(seq_len(nrow(date_rows)) %% 2 == 0),]$date

  # Assign each data row to one or two date plys. Because date plys can overlap,
  # there are two columns so that one row can belong to two dates if needed.
  dt.NA <- as.character(NA)
  if(nrow(data.plys) == 0) {
    # zero-row input: just add the two (empty) labeling columns and skip modeling
    data.plys <- bind_cols(
      data.plys,
      tibble::tibble(odd.date.group=dt.NA, even.date.group=dt.NA)[c(),])
    out_list <- list()
  } else {
    # For each calendar date's run of rows, decide (a) which rows belong to that
    # date's own ply (primary) and (b) which rows also belong to the previous or
    # next date's ply (secondary), based on where the hour falls relative to the
    # fudged day_start/day_end window. The odd/even alternation keeps the two
    # overlapping assignments in separate columns.
    data.plys <- bind_cols(
      data.plys,
      bind_rows(lapply(seq_len(nrow(date_rows)), function(dt) {
        run <- date_rows[dt,]
        dt.today <- run$date
        dt.yesterday <- if(dt>1) date_rows[[dt-1,'date']] else dt.NA
        dt.tomorrow <- if(dt<nrow(date_rows)) date_rows[[dt+1,'date']] else dt.NA
        data.plys.rows <- run$start:run$end
        hr <- data.plys[data.plys.rows, 'hour']
        # primary: rows whose hour is within [day_start, day_end) belong to today
        primary.date <- c(dt.today, dt.NA)[ifelse(hr >= day_start_fudge & hr < day_end_fudge, 1, 2)]
        # secondary: early hours may be the tail of yesterday's >24-hr window
        # (hr <= day_end-24); late hours may be the head of tomorrow's window
        # (hr >= 24+day_start); in between, no secondary assignment
        secondary.date <- c(dt.yesterday, dt.NA, dt.tomorrow)[ifelse(hr <= (day_end_fudge - 24), 1, ifelse(hr < (24 + day_start_fudge), 2, 3))]
        if(dt.today %in% odd.dates) {
          tibble(odd.date.group=primary.date, even.date.group=secondary.date)
        } else { # dt.today %in% even.dates
          tibble(odd.date.group=secondary.date, even.date.group=primary.date)
        }
      }))) %>%
      as.data.frame(stringsAsFactors=FALSE)

    # filter out dates that don't ever appear on their own - this especially weeds
    # out first and last dates that were probably meant just to be part of the
    # second or penultimate dates when the date range is >24.
    # A date is kept if it ever appears unpaired (solo) or paired with more than
    # one other date (double)
    date.pairings <- setNames(unique(data.plys[, c('odd.date.group','even.date.group')]), c('a','b'))
    date.pairings <- rbind(date.pairings, setNames(date.pairings, c('b','a'))) # make pairings symmetric
    date.pairings <- date.pairings[!is.na(date.pairings$a),]
    solo.dates <- date.pairings[is.na(date.pairings$b),]$a
    tbl.dates <- table(date.pairings$a)
    double.dates <- names(tbl.dates)[tbl.dates > 1]
    unique.dates <- sort(unique(c(solo.dates, double.dates)))

    # Apply model_fun to each ply of the data, using if-else in the lapply loop to
    # cover both the odd and even groupings in date order
    runs <- bind_rows(
      get_runs(data.plys$odd.date.group),
      get_runs(data.plys$even.date.group)) %>%
      filter(date %in% unique.dates) %>%
      arrange(date) %>%
      mutate(date=as.Date(date, tz=tz(data$solar.time)))
    # dummy assignments for R CMD check (dplyr non-standard evaluation)
    hour <- odd.date.group <- even.date.group <- '.dplyr.var'
    # drop the bookkeeping columns before handing data to model_fun
    data_plys <- data.plys %>%
      select(-date, -hour, -odd.date.group, -even.date.group)
    out_list <- lapply(seq_along(runs$date), function(dt) {
      # pick out the inst & daily plys for this date
      run <- runs[dt,]
      data_ply <- data_plys[run$start:run$end,]
      data_daily_ply <-
        if(!is.null(data_daily)) {
          data_daily[match(run$date, data_daily$date),]
        } else {
          NULL
        }
      ply_date <- run$date

      # compute timestep and run validity checks if requested. if timestep_days is
      # FALSE or NA, it will be passed as NA to the model_fun and only computed
      # there if needed for specific tests
      if(length(timestep_days) > 1) stop("expecting no more than 1 value in timestep_days")
      timestep_days <- if(isTRUE(timestep_days)) {
        mm_get_timestep(data_ply$solar.time, format='mean')
      } else if(is.na(timestep_days) || is.null(timestep_days) || timestep_days==FALSE) {
        NA
      } else timestep_days
      ply_validity <- mm_is_valid_day(
        data_ply=data_ply, day_start=day_start, day_end=day_end, day_tests=day_tests, required_timestep=required_timestep,
        ply_date=ply_date, timestep_days=timestep_days)

      # run the user's model_fun
      out <- model_fun(
        data_ply=data_ply, data_daily_ply=data_daily_ply,
        day_start=day_start, day_end=day_end, ply_date=ply_date,
        ply_validity=ply_validity, timestep_days=timestep_days,
        ...)
      # attach a date column if anything was returned
      if(is.null(out) || nrow(out)==0) NULL else data.frame(date=ply_date, out)
    })
  }

  # if out_list came back with nothing, give the model_fun one more chance to
  # suggest column names by calling it once with 0-row data and NA metadata
  if(length(out_list) == 0 || all(sapply(out_list, is.null))) {
    out_list <- tryCatch(
      list(
        model_fun(
          data_ply=data[c(),], data_daily_ply=data_daily[c(),],
          day_start=day_start, day_end=day_end, ply_date=as.Date(NA),
          ply_validity=NA, timestep_days=NA,
          ...) %>%
          mutate(date=as.Date(NA)) %>%
          select(date, everything())),
      error=function(e) {
        # and if it REALLY doesn't work, just return an empty 1-column df
        data.frame(date=as.Date(NA))[c(),]
      }
    )
  }

  # combine the 1-row dfs, making sure to start with a 0-row df that represents
  # the dfs with the most columns to keep the column ordering consistent across
  # runs. dplyr::bind_rows should take care of any missing columns, since these
  # are matched by name, and missing columns are filled with NAs
  if(length(out_list) == 0) {
    # NOTE(review): this branch appears unreachable (the fallback above always
    # produces a 1-element list) and would return NULL invisibly -- confirm
  } else {
    # use the widest result as the leading 0-row template for column ordering
    example_choice <-
      sapply(out_list, function(out) { if(is.null(out)) 0 else ncol(out) }) %>%
      which.max()
    out_example <- out_list[[example_choice]][FALSE,]
    bind_rows(c(list(out_example), out_list)) %>% as.data.frame()
  }
}
# Source: USGS-R/streamMetabolizer (documentation built Aug. 15, 2023).