swimctr: Oregon SWIM v2.5 Commercial Transport (CT) Model

Documented in local_truck_generation

#' Generate local truck trips (tour segments) from pseudo-firms
#'
#' @param synthetic_firms Data frame containing pseudo-firms and their
#'   typical attributes
#' @param generation_probabilities Data frame containing trip generation rates
#'   expressed as probabilities, for each trip generation category used in the
#'   model
#' @param random_seed Integer value of random number seed set at outset of
#'   sampling of trips generated by pseudo-firms (defaults to 1)
#' @param max_resampling_attempts Integer value that controls the number of
#'   times the generation sampling will repeat if it does not finish with same
#'   number of trips calculated in aggregate for the entire study area. If this
#'   number is exceeded then the model will error. Alternatively if `1` then
#'   resampling will be turned off.
#' @param max_resampling_threshold Floating point number defining how close
#'   the simulation must match aggregate number of trips for the entire
#'   modeled area (default is 0.5 percent)
#' @param save_to File name for saving the trip records with destinations
#'   appended in comma-separated value format (optional)
#'
#' @details The trip generation function samples the number of discrete daily
#'   trips generated by each pseudo-firm, and appends that number to each
#'   synthetic firm. The mode of transport used for each trip is also sampled
#'   and added to the trip records.
#'
#' @export
#' @examples
#' daily_trips <- local_truck_generation(synthetic_firms,
#'   generation_probabilities)
#' daily_trips <- local_truck_generation(synthetic_firms,
#'   generation_probabilities, random_seed = 1, max_resampling_attempts = 30,
#'   max_resampling_threshold = 0.5, "local-truck-origins.csv")

local_truck_generation <- function(synthetic_firms, generation_probabilities,
  random_seed = 1, max_resampling_attempts = 30, max_resampling_threshold = 0.5,
  save_to = NULL) {

  ct_msg(header = "Local truck trip generation")
  set.seed(as.integer(random_seed))

  # Reformat the generation probabilities into tall records that we can merge
  # with our firms, dropping cases where probabilities are zero
  probabilities <- generation_probabilities %>%
    tidyr::gather("truck_type", "p_gen", -category) %>%
    dplyr::filter(p_gen>0.0)

  # Start by showing the aggregate trip (AT) generation using these values,
  # which we'll compare the sum of microsimulated values to in calibration.
  AT <- synthetic_firms %>%
    dplyr::group_by(category) %>%
    dplyr::summarise(employees = sum(employees))
  truck_types <- sort(unique(probabilities$truck_type))
  aggregate_results <- dplyr::data_frame()
  for (t in truck_types) {
    p <- probabilities %>%
      dplyr::filter(truck_type==t) %>%
      dplyr::inner_join(AT, by = "category") %>%
      dplyr::mutate(target = round(employees*p_gen, 1)) %>%
      dplyr::select(-p_gen, -employees)
    aggregate_results <- dplyr::bind_rows(aggregate_results, p)
  }
  x <- addmargins(xtabs(target~category+truck_type, data = aggregate_results))
  ct_msg(x)

  # Finally, let's do the generation. We'll step through this process for each
  # truck type defined in the trip generation probabilities.
  results <- dplyr::data_frame()   # Container to hold the generated trips
  for (t in truck_types) {
    # Create a subset of the probabilities for this truck type, which we'll
    # merge with the alpha zone data. We use an inner join to ensure that we
    # are only including firms that we have non-zero rates for.
    these_probabilities <- dplyr::filter(probabilities, truck_type==t)
    these_firms <- dplyr::inner_join(synthetic_firms, these_probabilities,
      by = "category")

    # Calculate the total daily trips generated by each firm and sum them for
    # a target total. We will require the sum of the microsimulated trips to
    # not deviate more than a user-specified difference from this initial
    # So we will loop until we fall within that window.
    success <- FALSE
    stage <- ""
    trials <- 1

    # If we have a small enough number of total trucks of this type it might be
    # hard to get under the maximum desirable error, even though we want it that
    # low for the more frequently occurring types. So we'll keep track of the
    # "best solution so far" so that we can retain that if we do not converge
    # before we hit the maximum number of resampling attempts.
    best_solution <- NULL
    best_difference <- 999.9  # An easy bar to pass beneath

    while (!success) {
      # Check to make sure that we haven't exceeded the maximum number of
      # attempts at replanning. If we have something pathological has happened
      # and we need to stop dead in our tracks.
      if (trials>as.integer(max_resampling_attempts)) {
        these_firms$daily_trips <- best_solution
        warning_message <- paste("Max resampling exceeded while working on", t,
          "generation, set to best solution")
        warning(warning_message)
        break
      }

      # Calculate our target
      these_firms$daily_trips <- these_firms$employees*these_firms$p_gen
      initial_total <- round(sum(these_firms$daily_trips), 1)

      # Generate discrete number of trucks for each firm. Use random draws to
      # decide what to do with the fractional part of the trips.
      pw <- trunc(these_firms$daily_trips)   # Retain the whole part
      fp <- these_firms$daily_trips-pw    # Retain the fractional part
      rn <- runif(length(fp), 0, 1)   # Random draws to compare to
      these_firms$daily_trips <- ifelse(rn<=fp, pw+1, pw)
      adjusted_total <- sum(these_firms$daily_trips)
      pct_difference<-((adjusted_total-initial_total)/initial_total)*100.0

      # Store the best solution so far, in case we don't converge before we run
      # out of resampling attempts
      if (pct_difference < best_difference) {
        best_difference <- pct_difference
        best_solution <- these_firms$daily_trips
      }

      # Show us how we did
      ct_msg(paste(stage, t, " origins: combined=", initial_total,
        " simulated=", adjusted_total, " difference=",
        round(pct_difference,2), "%", sep=''))

      if(max_resampling_attempts > 0){  # only resample if asked
        # Evaluate success
        success <- abs(pct_difference) <= as.numeric(max_resampling_threshold)
        stage <- "Resampling "
        trials <- trials+1
      } else {
        break
      }
    }

    # For now we'll combine the results for this truck type with all of the
    # others processed. At this point we're appended the number of daily
    # truck trips by truck type to each record of the firm data.
    p <- dplyr::filter(these_firms, daily_trips>0)
    results <- dplyr::bind_rows(results, p)
  }

  # Create a list that repeats the firmID by the number of daily trips.
  # Because each firm can generate more than one type of truck we have one or
  # more trips from the same firm. We'll need to use a merge key that handles
  # both firmID and truck type.
  origins <- rep(results$firmID, results$daily_trips)
  trucks <- rep(results$truck_type, results$daily_trips)
  trip_records <- dplyr::data_frame(firmID = as.integer(origins),
    truck_type = as.character(trucks))

  # Now we simply merge this with the input data table, ensuring that all of
  # the origins get matched with the firm's information. We'll drop the info
  # that we don't need in subsequent models.
  truck_origins <- trip_records %>%
    dplyr::left_join(results, by=c("firmID", "truck_type")) %>%
    dplyr::select(-p_gen, -daily_trips) %>%
    dplyr::mutate(taz = as.integer(taz), employees = as.integer(employees),
      origin = taz)

  # If the user has specified they want to store these intermediate results in
  # a text file then do so
  if (!is.null(save_to)) readr::write_csv(truck_origins, save_to)

  # Compare the initial aggregate results with the microsimulated ones, summed
  # by trip generation categories. The results are for information only, for if
  # we got past the resampling threshold earlier we're good.
  combined <- truck_origins %>%
    dplyr::group_by(category, truck_type) %>%
    dplyr::summarise(simulated = n()) %>%
    dplyr::right_join(aggregate_results, by = c("category", "truck_type"))
  outliers <- combined %>%
    dplyr::mutate(pct_difference = ((simulated-target)/target)*100) %>%
    dplyr::filter(pct_difference > max_resampling_threshold)
  if (nrow(outliers)>0) {
    ct_msg("Cases where total simulated trips exceed target thresholds:")
    ct_msg(outliers)
  }

  # Return the list of daily truck origins
  ct_msg(paste(nrow(truck_origins), "daily local truck origins generated"))
  truck_origins
}