spflow: Spatial Econometric Interaction Models

Documented in derive_spflow_matrices

#' @title Compute the design matrices used during the estimation with [spflow()]
#'
#' @description
#' These are internal functions called within the estimation procedure
#'
#' @details
#' The key to an efficient estimation is to preserve the relational
#' representation of the data for origins, destinations and
#' origins-destinations pairs.
#' This requires to be aware of the following;
#'  - there are up to three sources of data: OD-pairs & origin-nodes & destination-nodes
#'  - each variable may be used in three different ways: directly & as a spatial lag
#'   (like in the SDM) & as an instrument for the S2SLS estimator
#'  - the model matrices can be split into five groups
#'      1. "Y_" = OD-flows (the dependent variable, in matrix form)
#'      2. "P_" = OD-pair attributes (in matrix form)
#'      3. "D_" = destination attributes
#'      4. "O_" = origin attributes
#'      5. "I_" = intra-regional attributes
#'
#' The additional separation of model matrices and data-sources makes sense if
#' the list of origins coincides with the list of destinations.
#' In this case, we can use data from the same set of nodes as origin,
#' destination, and intra-regional characteristics and each of these enter
#' the model in different ways.
#'
#' The model formula interface in [spflow()] is used to specify, how the variables
#' in the data-sources are used.
#' Any transformations of variables are handled by  R's build-in tools and
#' spatial lags, that are specified in the augments `sdm_variables` and
#' `twosls_instrumental_variables` to [spflow_control()] are calculated after
#' transformations have been applied.
#' Below is an explanation of the formula parts:
#'  - "norm" variables are not lagged
#'  - "sdm" variables are lagged once and used as explanatory variables
#'  - "inst" variables are lagged twice and used as instruments.
#'     If a variable is at the same time as instrument and as sdm-variable we
#'     have to increase the lags-order to avoid multicollinearity issues
#'     \insertCite{Dargel2021}{spflow}.
#'
#' @references \insertAllCited{}
#' @importFrom Matrix drop0
#' @name derive_spflow_matrices
#' @keywords internal
#' @return A list containing all design matrices required for estimation, impact calculation and prediction
derive_spflow_matrices <- function(
  id_net_pair,
  spflow_networks,
  spflow_formula,
  spflow_control,
  na_rm = FALSE) {


  spflow_data <- pull_spflow_data(spflow_networks)
  spflow_indicators <- subset_keycols(spflow_data[["pair"]], drop_keys = FALSE)
  do_indexes <- cbind(as.integer(spflow_indicators[[1]]), as.integer(spflow_indicators[[2]]))

  od_ids <- id(pull_member(spflow_networks, id_net_pair))
  OW <- neighborhood(spflow_networks, od_ids["orig"])
  DW <- neighborhood(spflow_networks, od_ids["dest"])



  ### derive the different matrices
  spflow_matrices <- named_list(c("CONST", "D_", "O_", "I_", "P_", "Y_"))
  fourmulas_by_part <- interpret_spflow_formula(spflow_formula, spflow_control)
  fourmulas_by_source <- translist(fourmulas_by_part)

  spflow_matrices[["CONST"]] <- derive_spflow_constants(
    use_global_const = fourmulas_by_part[["constants"]][["global"]],
    use_intra_const = isTRUE(fourmulas_by_part[["constants"]][["intra"]]),
    use_instruments = spflow_control[["estimation_method"]] == "s2sls",
    spflow_indicators = spflow_indicators,
    OW = OW, DW = DW)

  ## transform and lag node data
  assert_NA <- function(matrix_key) assert(
    na_rm,"NA's generated by formula part %s(...)!", matrix_key)
  complete_nodeobs <- function(matrix_key) {

    obs_X <- spflow_matrices[[matrix_key]] %|!|% complete.cases
    if (all(obs_X))
      return(NULL)

    assert_NA(matrix_key)
    return(obs_X)
  }
  model_cols <- function(x) Filter(Negate(is.character), subset_keycols(x))

  spflow_matrices[["D_"]] <- transform_node_data(
    threepart_formula = fourmulas_by_source[["D_"]],
    node_df = model_cols(spflow_data[["dest"]]),
    W = DW)
  obs_D <- complete_nodeobs("D_")


  spflow_matrices[["O_"]] <- transform_node_data(
    threepart_formula = fourmulas_by_source[["O_"]],
    node_df = model_cols(spflow_data[["orig"]]),
    W = OW)
  obs_O <- complete_nodeobs("O_")


  spflow_matrices[["I_"]] <- transform_node_data(
    threepart_formula = fourmulas_by_source[["I_"]],
    node_df = model_cols(spflow_data[["orig"]]),
    W = OW)
  obs_I <- complete_nodeobs("I_")


  ## transform and lag od-pair attributes
  incomplete_pairobs <- function(matrix_key) {

    miss_X <- lapply(spflow_matrices[[matrix_key]], is.na)
    miss_X <- Reduce("|", miss_X)

    if (!any(miss_X))
      return(NULL)

    assert_NA(matrix_key)
    return(miss_X)
  }
  remove_na_pairs <- function(na_pos, matrix_list) {

    if (is.null(na_pos) | is.null(matrix_list))
      return(matrix_list)


    matrix_list <- lapply(matrix_list, function(mat) {

      attr_inst <- attr_inst_status(mat)
      mat <- na2zero(mat)
      is_sparse <- (nnzero(mat) * 2) < length(mat)
      mat <- as(mat, ifelse(is_sparse, "Matrix", "matrix"))
      attr_inst_status(mat) <- attr_inst
      mat
    })

    return(matrix_list)
  }

  spflow_matrices[["P_"]] <- transform_pair_data(
    threepart_formula = fourmulas_by_source[["P_"]],
    pair_df = subset_keycols(spflow_data[["pair"]]),
    spflow_indicators = spflow_indicators,
    OW = OW, DW = DW,
    reduce_pair_instruments = isTRUE(spflow_control[["twosls_reduce_pair_instruments"]]))
  na_G <- incomplete_pairobs("P_")
  spflow_matrices[["P_"]] <- remove_na_pairs(na_G, spflow_matrices[["P_"]])

  ## transform and lag od-flows
  spflow_matrices[["Y_"]] <- transform_flow_data(
    threepart_formula = fourmulas_by_source[["Y_"]],
    flow_df = subset_keycols(spflow_data[["pair"]]),
    spflow_indicators = spflow_indicators,
    OW = OW, DW = DW,
    model = spflow_control[["model"]])
  na_Y <- incomplete_pairobs("Y_")
  actual_Y <- spflow_matrices[["Y_"]][[1]][do_indexes]
  spflow_matrices[["Y_"]] <- remove_na_pairs(na_Y, spflow_matrices[["Y_"]])

  ### identify which information sets are available for each od pair
  has_ZZ <- NULL # units with known signal (for predictions)
  has_ZY <- NULL # units with known signal and flows-lags (for fitting)

  if (!all(obs_D))
    has_ZZ <- update_logicals(has_ZZ, obs_D[do_indexes[1]])
  if (!all(obs_O))
    has_ZZ <- update_logicals(has_ZZ, obs_O[do_indexes[2]])
  if (!all(obs_I))
    has_ZZ <- update_logicals(has_ZZ, (obs_I[do_indexes[2]] | spflow_indicators[[2]] != spflow_indicators[[1]]))
  if (!is.null(na_G))
    has_ZZ <- update_logicals(has_ZZ, !(na_G[do_indexes]))
  if (!is.null(na_Y))
    has_ZY <- update_logicals(has_ZZ, !(na_Y[do_indexes]))

  weights <- derive_spflow_weights(
    weight_funs = spflow_control[["weight_functions"]],
    spflow_data = spflow_data,
    do_indexes = do_indexes)

  spflow_indicators <- list2df(
    spflow_indicators,
    HAS_ZZ = has_ZZ,
    HAS_ZY = has_ZY,
    WEIGHT = weights,
    ACTUAL = actual_Y)

  return(c(spflow_matrices, list(spflow_indicators = spflow_indicators)))
}


# ---- matrix generators ------------------------------------------------------
#' @importFrom Matrix Diagonal
#' @keywords internal
derive_spflow_constants <- function(
  use_global_const,
  use_intra_const,
  use_instruments,
  spflow_indicators = NULL,
  OW = NULL,
  DW = NULL) {


  c_terms <- list("(Intercept)" =  `attr_inst_status<-`(1, !use_global_const))
  if (!use_intra_const)
    return(c_terms)

  n <- nlevels(spflow_indicators[[1]])
  c_terms <- c(c_terms, "(Intra)" = `attr_inst_status<-`(Diagonal(n), FALSE))
  if (!use_instruments)
    return(c_terms)

  # for computation of instruments the indicator is based on observed Y
  Y_indicator <- spflow_indicators2mat(spflow_indicators, do_filter = "IN_SAMPLE")
  intra_lags <- double_lag_matrix(
    M = Diagonal(n),
    OW = OW,
    DW = DW,
    name = "(Intra)",
    key = "I",
    M_indicator = Y_indicator,
    symmetric_lags = is.null(Y_indicator),
    lag_order = 2,
    return_all_lags = TRUE,
    lags_are_instruments = TRUE)
  return(c(c_terms, intra_lags[-1]))
}


#' @keywords internal
transform_node_data <- function(
  threepart_formula,
  node_df,
  W) {

  if (is.null(threepart_formula) | is.null(node_df))
    return(NULL)

  # transformation according to the formula
  combined_formula <- combine_rhs_formulas(threepart_formula)
  node_df <- subset_keycols(node_df, drop_keys = TRUE)
  node_mat <- flow_conform_model_matrix(combined_formula, node_df)
  lostobs <- get_lostobs(node_df, node_mat)
  node_mat <- impute_lost_cases(node_mat, lostobs, NA)

  # spatial lags for sdm and instrument parts
  var_use <- derive_variables_use(threepart_formula, node_df)
  lag_num2var <- lookup(as.integer(var_use[["num_lags"]]), row.names(var_use))
  node_mat <- add_lagged_cols(node_mat, W, lag_num2var)

  inst_status2var <- unlist(var_use[["inst_attr"]])
  inst_order2var <- c(which(!inst_status2var),which(inst_status2var))
  node_mat <- node_mat[,inst_order2var,drop = FALSE]
  attr_inst_status(node_mat) <- sort(inst_status2var)

  return(node_mat)
}


#' @keywords internal
transform_pair_data <- function(
  threepart_formula,
  pair_df,
  spflow_indicators,
  OW,
  DW,
  reduce_pair_instruments) {

  if (is.null(threepart_formula) | is.null(pair_df))
    return(NULL)

  # transformation according to the formula and concert to matrix-list format
  combined_formula <- combine_rhs_formulas(threepart_formula)
  pair_mat <- flow_conform_model_matrix(combined_formula, pair_df)
  lostobs <- get_lostobs(pair_df, pair_mat)
  pair_mat <- impute_lost_cases(pair_mat, lostobs, NA)
  pair_mat <- spflow_indicators2matlist(cbind(spflow_indicators, pair_mat))

  # spatial lags for instrument parts
  var_use <- derive_variables_use(threepart_formula, pair_df)
  lag_num2var <- lookup(as.integer(var_use[["num_lags"]]),row.names(var_use))
  pair_mat <-
    Reduce("c", lapply(lookup(names(pair_mat)),  function(.var) {
      double_lag_matrix(
        M = pair_mat[[.var]],
        DW = DW,
        OW = OW,
        name = .var,
        key = "M",
        M_indicator = spflow_indicators2mat(spflow_indicators),
        lag_order = lag_num2var[.var],
        return_all_lags = !reduce_pair_instruments,
        lags_are_instruments = TRUE)}))

  return(pair_mat)
}

#' @keywords internal
transform_flow_data <- function(
  threepart_formula,
  flow_df,
  spflow_indicators,
  OW,
  DW,
  model) {

  if (is.null(threepart_formula) | is.null(flow_df))
    return(NULL)


  combined_formula <- combine_rhs_formulas(threepart_formula)
  flow_mat <- flow_conform_model_matrix(combined_formula, flow_df)
  lostobs <- get_lostobs(flow_df, flow_mat)
  flow_mat <- impute_lost_cases(flow_mat, lostobs, NA)
  flow_mat <- spflow_indicators2matlist(cbind(spflow_indicators, flow_mat))

  flow_mat <- Reduce("c", Map(function(.var, .mat) {
    lag_flow_matrix(
      Y = .mat,
      model = model,
      DW = DW,
      OW = OW,
      name = .var,
      M_indicator = spflow_indicators2mat(spflow_indicators))},
    .var = names(flow_mat),
    .mat = flow_mat))

  return(flow_mat)
}

# ---- helpers ----------------------------------------------------------------
#' @keywords internal
pull_spflow_data <- function(.multinet, pair_id = id(.multinet)[["pairs"]][1]) {

  source_ids <- as.list(id(.multinet@pairs[[pair_id]]))
  flow_data <- lapply(source_ids, function(.id){
    source_data <- dat(.multinet, .id)
    row.names(source_data) <- NULL
    source_data
  })
  return(flow_data)
}

#' @keywords internal
get_keycols <- function(df, no_coords = FALSE) {
  keys <- c(attr_key_do(df), attr_key_nodes(df))
  if (no_coords)
    return(keys)

  return(c(keys, attr_coord_col(df)))
}

#' @keywords internal
subset_keycols <- function(df, drop_keys = TRUE) {
  keep_cols <- get_keycols(df)
  if (drop_keys) keep_cols <- setdiff(names(df), keep_cols)
  return(df[, keep_cols, drop = FALSE])
}

#' @keywords internal
flow_conform_model_matrix <- function(formula,data) {
  terms_obj <- terms(formula, data = data)
  attr(terms_obj,"intercept") <- formula_expands_factors(formula,data) * 1
  mat <- stats::model.matrix.lm(terms_obj,data,na.action = "na.pass")
  mat[,colnames(mat) != "(Intercept)", drop = FALSE]
}

#' @keywords internal
get_lostobs <- function(pre, trans) {
  if (is.null(pre) || is.null(trans) || nrow(trans) == nrow(pre))
    return(NULL)

  lost <- rep(TRUE, nrow(pre))
  lost[as.integer(row.names(trans))] <- FALSE
  return(lost)
}

#' @keywords internal
impute_lost_cases <- function(mat, lost_cases, imp = 0) {

  if (is.null(lost_cases))
    return(mat)

  assert(is.logical(lost_cases) & length(lost_cases >= nrow(mat)))
  if (sum(lost_cases) == nrow(mat))
    return(mat)

  xmat <- matrix(imp, nrow = length(lost_cases), ncol = ncol(mat), dimnames = list(NULL, colnames(mat)))
  xmat[!lost_cases, ] <- mat
  return(xmat)
}

#' @keywords internal
derive_variables_use <- function(
  formula_part,
  data_source) {

  var_use <- lapply(formula_part, "predict_tranfomed_vars", data_source)
  var_use_types <- named_list(c("norm","sdm","inst"))
  var_use_types[names(var_use)] <- var_use

  all_vars <- unique(unlist(var_use, use.names = FALSE))
  get_v_use <- function(.v) data.frame(lapply(var_use_types, function(.u) any(.u == .v)))
  var_use_df <- do.call("rbind", lapply(lookup(all_vars), get_v_use))
  var_use_df[["num_lags"]] <- var_use_df[["sdm"]] + 2 * var_use_df[["inst"]]

  inst_attr <- Map(
    function(norm, sdm, num_lags){

      inst_statu <- rep(TRUE, num_lags + 1)
      inst_statu[1] <- !norm
      if(num_lags >= 1)
        inst_statu[2] <- !sdm
      inst_statu
    },
    var_use_df$norm, var_use_df$sdm, var_use_df$num_lags)

  var_use_df[["inst_attr"]] <- I(inst_attr)
  var_use_df
}


#' @keywords internal
list2df <- function(...) {
  data.frame(Filter(function(x) length(x) != 0, list(...)))
}


#' @keywords internal
na2zero <- function(x) {

  if (is.atomic(x))
    x[is.na(x)] <- 0

  if (inherits(x,"Matrix"))
    x@x[is.na(x@x)] <- 0

  return(x)
}

#' @keywords internal
derive_spflow_weights <- function(weight_funs, spflow_data, do_indexes) {

  if (is.null(weight_funs))
    return(NULL)

  eval_wfun <- function(wkey) {
    wfun <- weight_funs[[wkey]]
    wdat <- spflow_data[[wkey]]

    if (is.null(wfun) | is.null(wdat))
      return(NULL)

    wt <- try(wfun(wdat), silent = TRUE)
    assert(is_one_of(wt, c("logical","numeric")),'
    Derivation of weights failed!
    Please ensure the weight functions for %s
    provided in spflow_control return a logical or a numeric.', wkey)

    if (wkey == "orig")
      wt <- wt[do_indexes[,2]]

    if (wkey == "dest")
      wt <- wt[do_indexes[,1]]

    return(wt)
  }
  wt_parts <- named_list(c("pair","orig","dest"))
  wt_parts[["pair"]] <- eval_wfun("pair")
  wt_parts[["orig"]] <- eval_wfun("orig")
  wt_parts[["dest"]] <- eval_wfun("dest")

  wt_is_logical <- all(sapply(wt_parts, is.logical))
  wt <- Reduce(ifelse(wt_is_logical,"&","*"), wt_parts)
  wt[wt<0] <- NA
  wt[!is.finite(wt)] <- ifelse(wt_is_logical,FALSE,0)
  return(wt)
}