R/scenario_selection.R

Defines functions scenario_selection

Documented in scenario_selection

#' A function for scenario (event or ground motion map) selection
#'
#' This function takes the target hazard (and target hazard magnitude and/or distance distributions)
#' and the hazard (and hazard magnitude and/or distance distributions) produced by each of the
#' considered event scenarios or ground motion map scenarios to efficiently and
#' optimally select a series of subset scenarios with adjusted annual
#' occurrence rates by using LASSO regression.
#'
#' @param Y The target hazard and target hazard magnitude/distance distributions
#' @param X The hazard matrix produced by each of the considered scenarios
#' @param min_hazard The minimum hazard value in \code{Y} that is considered in regression.
#' This is used to avoid having extremely large weights. Note the weights is 1/\code{Y}.
#' The default value for \code{min_hazard} is 1e-7.
#' @param output_dir If not NULL, the results will be written into a few CSV files under the specified directory
#' @param max_rate_multiplier The maximum value for the multiplier of annual occurrence rate when running LASSO,
#' a positive numeric number. It is an additional constraint for scenario selection.
#' The rate multipliers are not straightly constrained: the rate multipliers could be much larger
#' when the number of selected scenarios is too small; and the rate multipliers will get better
#' constrained when the number of selected scenarios gets more.
#' @param num_lambda The maximum number of trail penalty parameters. A larger \code{num_lambda}
#' will search for more possible subset events.
#' @param Weight The weight vector for each element in \code{Y}. The default is 1 that indicates
#' using the default weight of scenario selection algorithm (which is 1/\code{Y}). If the other
#' weights are preferred, then a vector with the same length as \code{Y} for the desired weights
#' can be specified. Since the weight of 1/\code{Y} has been set in the function, so you'd need
#' to multiply your desired weights by \code{Y} to compensate the default weights.
#'
#' @return A list of three elements is returned. The first element is
#' the corresponding adjusted rate multipliers for each event
#' (or each column in \code{X}), the second element is recovered hazard by the selected subset events,
#' and the last element is the logarithmic mean squared error.
#' The events with positive factors are selected events and the events with factors = 0 are
#' NOT selected.
#' \item{betas}{A matrix of the adjusted rate multipliers for each scenario for each set of selected
#' scenarios. The dimension is total number of candidate scenarios by the number of set of selected
#' scenarios. The scenarios with zero multipliers are unselected}
#' \item{errors_mat}{A matrix with four columns: the number of selected scenarios following by
#' three error metrics (mean of squared logarithmic residuals, mean of absolute logarithmic
#' relative errors, and mean of absolute arithmetic relative errors)}
#' \item{Y_hat}{The predicted hazard by each set of selected scenarios}
#'
#' @importFrom glmnet glmnet
#' @importFrom stats median
#' @importFrom utils txtProgressBar
#' @importFrom utils setTxtProgressBar
#'
#' @export
scenario_selection <- function(Y, X, min_hazard = 1e-7, output_dir = NULL, max_rate_multiplier = NULL,
                               num_lambda = 1000, Weight = 1) {

  Weight <- (1 / Y) * Weight

  if (is.numeric(min_hazard) & (min_hazard >= 0)) {
    Idx_include <- which(Y >= min_hazard)  # only consider the hazards with more than min_hazard
  } else {
    Idx_include <- seq(1, length(Y))
  }

  # run LASSO
  if (is.null(max_rate_multiplier)) {
    cat('Conduct selection \n')
    lasso_res <- glmnet(x = X[Idx_include, ], y = Y[Idx_include],
                        weights = Weight[Idx_include],
                        lambda.min.ratio = 0, nlambda = num_lambda,
                        lower.limits = 0, intercept = F, trace.it = TRUE)

    lasso_res <- relax.glmnet(fit = lasso_res,
                              x = X[Idx_include, ], y = Y[Idx_include],
                              weights = Weight[Idx_include],
                              lambda.min.ratio = 0, nlambda = num_lambda,
                              lower.limits = 0, intercept = F,
                              trace.it = FALSE)
  } else if (max_rate_multiplier > 0) {
    cat('Conduct selection \n')
    lasso_res <- glmnet(x = X[Idx_include, ], y = Y[Idx_include],
                        weights = Weight[Idx_include],
                        lambda.min.ratio = 0, nlambda = num_lambda,
                        lower.limits = 0, intercept = F, trace.it = TRUE,
                        upper.limits = max_rate_multiplier)

    lasso_res <- relax.glmnet(fit = lasso_res,
                              x = X[Idx_include, ], y = Y[Idx_include],
                              weights = Weight[Idx_include],
                              lambda.min.ratio = 0, nlambda = num_lambda,
                              lower.limits = 0, intercept = F,
                              trace.it = FALSE,
                              upper.limits = max_rate_multiplier)
  } else {
    stop('max_rate_multiplier is not valid, please correct the input!')
  }

  # get the series of selected event sets
  num_increasing <- apply(lasso_res$relaxed$beta, 2, function(x) sum(x > 0))
  uni_num <- sort(unique(num_increasing[num_increasing > 0]))
  idx <- as.numeric(sapply(uni_num, function(x) {
    # find the index such that the corresponding dev.ratio is max.
    tmp_idx <- which(num_increasing == x)
    tmp_idx[which.max(lasso_res$relaxed$dev.ratio[tmp_idx])]
  }))

  m_sq_logerr <- rep(0, length(idx))
  m_abs_log_rediff <- rep(0, length(idx))
  m_abs_rediff <- rep(0, length(idx))
  Y_hat <- matrix(data = 0, nrow = length(Y), ncol = length(idx))
  betas_mat <- matrix(data = 0, nrow = ncol(X), ncol = length(idx))

  cat('Prepare and output results \n')
  pb <- txtProgressBar(min = 0, max = length(idx), initial = 0,
                       style = 3)

  for (i in 1:length(idx)) {
    betas <- lasso_res$relaxed$beta[, idx[i]]
    betas_mat[, i] <- betas

    m_sq_logerr[i] <- mean((log(Y[Idx_include]) -
                              ifelse(X[Idx_include, ] %*% betas == 0,
                                     0,
                                     log(X[Idx_include, ] %*% betas)))^2)
    m_abs_log_rediff[i] <- mean(abs((log(Y[Idx_include]) -
                                       ifelse(X[Idx_include, ] %*% betas == 0,
                                              0,
                                              log(X[Idx_include, ] %*% betas)))/log(Y[Idx_include])))
    m_abs_rediff[i] <- mean(abs((Y[Idx_include] - X[Idx_include, ] %*% betas)/Y[Idx_include]))
    Y_hat[, i] <- X %*% betas

    setTxtProgressBar(pb, i)
  }

  colnames(betas_mat) <- paste0('selected_scenario_num_', uni_num)
  rownames(betas_mat) <- paste0('rate_multiplier_scenario_', 1:ncol(X))
  errors_mat <- cbind(uni_num, m_sq_logerr, m_abs_log_rediff, m_abs_rediff)
  colnames(errors_mat) <- c('number_selected_scenario', 'mean_squared_log_error',
                            'mean_absolute_log_relative_difference',
                            'mean_absolute_relative_difference')
  colnames(Y_hat) <- paste0('selected_scenario_num_', uni_num)

  if (!is.null(output_dir)) {
    if (dir.exists(output_dir)) {
      write.csv(betas_mat, file = paste0(output_dir, '/betas_mat.csv'))
      write.csv(errors_mat, file = paste0(output_dir, '/errors_mat.csv'), row.names = F)
      write.csv(Y_hat, file = paste0(output_dir, '/Y_hat.csv'), row.names = F)
      cat('\n The results are exported! \n')
    } else {
      cat('\n The results are not written into any files, please input a valid the output_dir! \n')
    }
  } else {
    cat('\n The results are not written into any files. \n')
  }

  res <- list()
  res$betas <- betas_mat
  res$errors_mat <- errors_mat
  res$Y_hat <- Y_hat
  return(res)
}
wltcwpf/RPSHA documentation built on July 6, 2023, 1:02 a.m.