R/knn_space_time_disagg.R

Defines functions get_meta_data ym_labels full_year get_scale_factor knn_space_time_disagg

Documented in knn_space_time_disagg

#' Spatial and temporal disaggregation of flow data
#'
#' `knn_space_time_disagg()` disaggregates annual flow data spatially and
#' temporally (to monthly), using a spatial and monthly flow pattern selected
#' from an "index year". The index year is selected using a k nearest-neighbor
#' approach, (Knowak et al., 2010).
#'
#' The method is described in detail in *Knowak et al.* (2010). The methodology
#' disaggregates annual flow data (`ann_flow`) by selecting an index year from
#' `ann_index_flow` using [knn_get_index_year()]. After the index year is
#' selected, values from `ann_flow` are disaggregated spatially, and temporally
#' based on `mon_flow`. The spatial pattern is reflected by including different
#' sites as columns in `mon_flow`, and the monthly disaggregation, uses the
#' monthly pattern in `mon_flow` to disaggregate the data temporally.
#' Summability is preserved using this method, if the values selected in
#' `mon_flow` are scaled and if the columns (or a subset of columns) in
#' `mon_flow` sum together to equal `ann_index_flow`.
#'
#' `scale_sites`: In some cases, it is desirable to select monthly flow
#' directly, instead of
#' scaling it. This can be performed by only scaling certain sites, using
#' `scale_sites`. `scale_sites` should be a boolean, or a vector of numerics. If
#' `TRUE`, then all sites are scaled. If `FALSE`, all sites monthly values
#' are selected directly. Otherwise, `scale_sites` should be a vector of the
#' sites that should be scaled, based on their column index from `mon_flow`. For
#' example, if `mon_flow` is a matrix with 4 columns, then setting `mon_flow` to
#' `c(2, 3)` will scale the values in sites 2 and 3 (columns 2 and 3), while
#' selecting flow values directly in sites 1 and 2.
#'
#' @param mon_flow Monthly natural flow. Used for spatially and
#'   temporally disaggregating the flow data (`ann_flow`) based on the index
#'   year selected from `ann_index_flow`, by [knn_get_index_year()]. Each column
#'   represents a different site, and the annual flow at the index gage will be
#'   disaggregated to each of these sites at he monthly level. If there are
#'   three columns in this matrix, then the values in `ann_flow` will be
#'   disaggregated to three sites. `mon_flow` should have the same years as
#'   `ann_index_flow`, therefore, it should contain 12 times more rows than
#'   `ann_index_flow`. The flow data in `mon_flow` should also contain values
#'   for the same years as `ann_index_flow`, though there are no checks
#'   performed to check this, since this is expected to be a dimensionless
#'   matrix. `mon_flow` can have named or unnamed columns. If they are named,
#'   the names are preserved in the disaggregated output. If they are unnamed,
#'   the columns are renamed S1-SN where N is the number of columns.
#'
#' @param start_month The start month of the `mon_flow` as an integer. 1 =
#'   January, 2 = February, etc. Used to correctly label the output data.
#'
#' @param scale_sites `TRUE`/`FALSE` - scale all the sites. Otherwise, a numeric
#'   vectotr specifying the site numbers (column indices), that will scale the
#'   index year's volume based on the annual flow being disaggregated. The
#'   remaining sites will select the index year directly. See **Details**.
#'
#' @param nsim Number of times to repeat the space/time disaggregation.
#'
#' @param index_years Optional. If specified, these index years will be used
#'   instead of selecting years based on weighted sampling via
#'   [knn_get_index_year()].
#'
#' @return A [`knnst`] object.
#'
#' @inheritParams knn_get_index_year
#'
#' @author Ken Nowak
#'
#' @references Nowak, K., Prairie, J., Rajagopalan, B., Lall, U. (2010).
#'   A nonparametric stochastic approach for multisite disaggregation of
#'   annual to daily streamflow. *Water Resources Research.*
#'
#' @seealso [`knnst`], [knn_get_index_year()]
#'
#' @examples
#'
#' # a sample of three years of flow data
#' flow_mat <- cbind(c(2000, 2001, 2002), c(1400, 1567, 1325))
#' # made up historical data to use as index years
#' ind_flow <- cbind(1901:1980, rnorm(80, mean = 1500, sd = 300))
#' # make up monthly flow for two sites
#' mon_flow <- cbind(
#'   rnorm(80 * 12, mean = 20, sd = 5),
#'   rnorm(80 * 12, mean = 120, sd = 45)
#' )
#' knn_space_time_disagg(flow_mat, ind_flow, mon_flow, 1, scale_sites = 1:2)
#'
#' @export
knn_space_time_disagg <- function(ann_flow,
                         ann_index_flow,
                         mon_flow,
                         start_month,
                         nsim = 1,
                         scale_sites = TRUE,
                         index_years = NULL,
                         k_weights = knn_params_default(nrow(ann_index_flow)),
                         random_seed = NULL)
{
  n_disagg_yrs <- nrow(ann_flow)

  # how many yrs of observed mon_flow
  assert_that(
    nrow(mon_flow) %% 12 == 0,
    msg = "`mon_flow` needs to have an even year's worth of data"
  )

  n_obs_yrs <- nrow(mon_flow)/12

  assert_that(
    n_obs_yrs == nrow(ann_index_flow),
    msg = "`ann_index_flow` and `mon_flow` must have the same number of years."
  )

  assert_that(ncol(ann_index_flow) == 2)
  assert_that(ncol(ann_flow) == 2)

  assert_that(
    length(start_month) == 1 && is.numeric(start_month) && start_month %in% 1:12,
    msg = "`start_month` should be a single integer from 1 to 12"
  )

  assert_that(
    (is.logical(scale_sites) && length(scale_sites) == 1) ||
      (is.numeric(scale_sites) && length(scale_sites) > 0),
    msg = "`scale_sites` should be a boolean scaler or numeric vector."
  )

  nsite <- ncol(mon_flow)

  if (!is.null(index_years)) {
    assert_that(is.matrix(index_years))

    assert_that(
      ncol(index_years) == nsim,
      msg = paste(
        "`index_years` must be specified for all simulations.\n",
        " So, `nsim` must equal the number of columns in `index_years`.",
      )
    )

    assert_that(
      nrow(index_years) == n_disagg_yrs,
      msg =
        "`index_years` must be specified for every year in the paleo record."
    )
  }

  # set this once; do not pass it to knn_get_index_year, b/c then the multiple
  # simulations would not be any different from one another
  if (!is.null(random_seed)) {
    assert_that(is.numeric(random_seed) && length(random_seed) == 1)
    set.seed(random_seed)
  }

  if (!is.null(index_years)) {
    message(
      "`index_years` is specified, so `k_weights` will be ignored."
    )
  }

  if (is.numeric(scale_sites)) {
    assert_that(
      all(scale_sites %in% 1:nsite),
      msg = "All `scale_sites` must be withing the number of sites in `mon_flow`."
    )
  } else {
    if (scale_sites)
      scale_sites <- 1:nsite
    else
      scale_sites <- integer(0)
  }

  if (!all(1:nsite %in% scale_sites)) {
    # set ind_sites to the remaining sites
    # ind_sites are selected directly
    ind_sites <- 1:nsite
    ind_sites <- ind_sites[!(1:nsite %in% scale_sites)]
    message(
      ifelse(length(ind_sites) > 1, "Sites ", "Site "),
      toString(ind_sites), "\n",
      "will be selected directly from the index years, i.e., not scaled."
    )
  } else {
    ind_sites <- NULL
  }

  # make up site names if input is unnamed
  if (is.null(colnames(mon_flow)))
  {
    site_names <- paste0("S", 1:ncol(mon_flow))
    message(
      "`mon_flow` does not have colnames. Renaming to:\n",
      paste(site_names, collapse = ", ")
    )
    colnames(mon_flow) <- site_names
  }

  # matrix for observed values - row(yr), col (month), index (site)
  dat_a <- array(data = NA, dim=c(n_obs_yrs, 12, nsite))

  # matrix for disag values - row(yr), col (month), index1 (site), index2(sim #)
  disag <- array(data = NA, dim = c(n_disagg_yrs, 12, nsite, nsim))

  # matrix for recording yr index for disag (optional)
  index_mat <- matrix(ncol = nsim, nrow = n_disagg_yrs)

  # matrix for recording scale factors used (optional)
  sf_mat <- matrix(ncol = nsim, nrow = n_disagg_yrs)

  # this loop moves observed monthly mon_flow from 2d matrix to 3d array
  mgn <- length(dat_a[1,1,])

  for (j in seq_len(mgn)) {

    s <- 1
    e <- 12

    for (i in seq_len(n_obs_yrs)) {

      dat_a[i, , j] <- mon_flow[s:e, j]

      s <- s + 12
      e <- e + 12
    }
  }

  # loop through the number of simulations ---------------------
  for(j in seq_len(nsim)){

    # this picks the 1st year for disag based only on the annual flow

    if (is.null(index_years)) {
      ind_yrs <- knn_get_index_year(ann_flow, ann_index_flow, k_weights)
      index_mat[, j] <- ind_yrs[, 1]
    } else {
      ind_yrs <- index_years[, j, drop = FALSE]
      index_mat[, j] <- ind_yrs[, 1]
    }

    # loop through all years that need disaggregated ---------------
    for(h in seq_len(n_disagg_yrs)) {

      # select the index year and scaling factor
      index_atts <- get_scale_factor(
        ind_yrs[h, 1],
        ann_flow[h, 2],
        ann_index_flow
      )

      sf_mat[h, j] <- index_atts$SF

      disag[h, , scale_sites, j] <- dat_a[index_atts$pos, , scale_sites] *
        index_atts$SF
      disag[h, , ind_sites, j] <- dat_a[index_atts$pos, , ind_sites]
    }
  }

  # create rownames for yyyy-mm
  yy <- ym_labels(ann_flow[,1], start_month)
  site_names <- colnames(mon_flow)

  # convert from 4-d array to list of 2-d arrays
  disag_out <- lapply(seq_len(nsim), function(ii) {
    disagg_flow <- do.call(
      cbind,
      lapply(seq_len(nsite), function(jj) as.vector(t(disag[,,jj,ii])))
    )
    rownames(disagg_flow) <- yy
    colnames(disagg_flow) <- site_names
    list(
      disagg_flow = disagg_flow,
      index_years = index_mat[,ii]
    )
  })

  # add in the historical data and index gage data, and move all the simulation
  # data in one layer.
  meta <- get_meta_data()
  meta[["call"]] <- deparse(sys.call())

  disag_out <- list(
    disagg_sims = disag_out,
    index_data = ann_index_flow,
    mon_flow = mon_flow,
    start_month = start_month,
    meta = meta
  )

  disag_out <- structure(disag_out, class = c("knnst", "list"))
  validate_knnst(disag_out)

  disag_out
}

#' Compute the scaling factor for the current year's flow from the index year
#'
#' Selects the magnitude of flow from `ann_index_flow` for the `index_year`.
#' Then scaling factor is computed as `flow` / `ann_index_flow[index_year]`.
#'
#' @param index_year The index year to select from ann_index_flow
#' @param flow The current flow to disaggregate (length == 1)
#' @param ann_index_flow Matrix of index years and flow. n x 2 matrix. First
#'   column is years.
#'
#' @noRd
#'
get_scale_factor <- function(index_year, flow, ann_index_flow)
{
  assert_that(length(index_year) == 1)
  assert_that(length(flow) == 1)
  assert_that(ncol(ann_index_flow) == 2)
  assert_that(index_year %in% ann_index_flow[,1])

  # index for selected yr
  pos <- match(index_year, ann_index_flow[,1])

  # scaling factor to apply for disag
  SF <- flow/(ann_index_flow[pos, 2])

  list(pos = pos, SF = SF)
}

# given a start month, returns a single year of months
full_year <- function(start_month)
{
  x <- rep(1:12, 2)
  x <- x[start_month:(start_month + 11)]

  x
}

# creates row name labels
# returns in yyyy-mm format
ym_labels <- function(years, start_month)
{
  # adjust the years for start_month not == January, because for example, a
  # start_monnth of October actually belongs to the previous year.
  # assumes years correspond with last month in year, not first month in year
  yy <- expand.grid(full_year(start_month), years)
  mm <- yy[,1]
  yy <- yy[,2]
  if (start_month != 1)
    yy[mm %in% start_month:12] <-  yy[mm %in% start_month:12] - 1

  yy <- paste(
    yy,
    formatC(mm, width = 2, format = "d", flag = "0"),
    sep = "-"
  )

  yy
}

get_meta_data <- function()
{
  list(
    user = Sys.info()[["user"]],
    date = Sys.time(),
    version = paste0("knnstdisagg v", utils::packageVersion("knnstdisagg"))
  )
}
rabutler-usbr/knnstdisagg documentation built on Sept. 14, 2023, 2:47 p.m.