#' @title Compute the design matrices used during the estimation with [spflow()]
#'
#' @description
#' These are internal functions called within the estimation procedure
#'
#' @details
#' The key to an efficient estimation is to preserve the relational
#' representation of the data for origins, destinations and
#' origins-destinations pairs.
#' This requires to be aware of the following;
#' - there are up to three sources of data: OD-pairs & origin-nodes & destination-nodes
#' - each variable may be used in three different ways: directly & as a spatial lag
#' (like in the SDM) & as an instrument for the S2SLS estimator
#' - the model matrices can be split into five groups
#' 1. "Y_" = OD-flows (the dependent variable, in matrix form)
#' 2. "P_" = OD-pair attributes (in matrix form)
#' 3. "D_" = destination attributes
#' 4. "O_" = origin attributes
#' 5. "I_" = intra-regional attributes
#'
#' The additional separation of model matrices and data-sources makes sense if
#' the list of origins coincides with the list of destinations.
#' In this case, we can use data from the same set of nodes as origin,
#' destination, and intra-regional characteristics and each of these enter
#' the model in different ways.
#'
#' The model formula interface in [spflow()] is used to specify, how the variables
#' in the data-sources are used.
#' Any transformations of variables are handled by R's build-in tools and
#' spatial lags, that are specified in the augments `sdm_variables` and
#' `twosls_instrumental_variables` to [spflow_control()] are calculated after
#' transformations have been applied.
#' Below is an explanation of the formula parts:
#' - "norm" variables are not lagged
#' - "sdm" variables are lagged once and used as explanatory variables
#' - "inst" variables are lagged twice and used as instruments.
#' If a variable is at the same time as instrument and as sdm-variable we
#' have to increase the lags-order to avoid multicollinearity issues
#' \insertCite{Dargel2021}{spflow}.
#'
#' @references \insertAllCited{}
#' @importFrom Matrix drop0
#' @name derive_spflow_matrices
#' @keywords internal
#' @return A list containing all design matrices required for estimation, impact calculation and prediction
derive_spflow_matrices <- function(
id_net_pair,
spflow_networks,
spflow_formula,
spflow_control,
na_rm = FALSE) {
spflow_data <- pull_spflow_data(spflow_networks)
spflow_indicators <- subset_keycols(spflow_data[["pair"]], drop_keys = FALSE)
do_indexes <- cbind(as.integer(spflow_indicators[[1]]), as.integer(spflow_indicators[[2]]))
od_ids <- id(pull_member(spflow_networks, id_net_pair))
OW <- neighborhood(spflow_networks, od_ids["orig"])
DW <- neighborhood(spflow_networks, od_ids["dest"])
### derive the different matrices
spflow_matrices <- named_list(c("CONST", "D_", "O_", "I_", "P_", "Y_"))
fourmulas_by_part <- interpret_spflow_formula(spflow_formula, spflow_control)
fourmulas_by_source <- translist(fourmulas_by_part)
spflow_matrices[["CONST"]] <- derive_spflow_constants(
use_global_const = fourmulas_by_part[["constants"]][["global"]],
use_intra_const = isTRUE(fourmulas_by_part[["constants"]][["intra"]]),
use_instruments = spflow_control[["estimation_method"]] == "s2sls",
spflow_indicators = spflow_indicators,
OW = OW, DW = DW)
## transform and lag node data
assert_NA <- function(matrix_key) assert(
na_rm,"NA's generated by formula part %s(...)!", matrix_key)
complete_nodeobs <- function(matrix_key) {
obs_X <- spflow_matrices[[matrix_key]] %|!|% complete.cases
if (all(obs_X))
return(NULL)
assert_NA(matrix_key)
return(obs_X)
}
model_cols <- function(x) Filter(Negate(is.character), subset_keycols(x))
spflow_matrices[["D_"]] <- transform_node_data(
threepart_formula = fourmulas_by_source[["D_"]],
node_df = model_cols(spflow_data[["dest"]]),
W = DW)
obs_D <- complete_nodeobs("D_")
spflow_matrices[["O_"]] <- transform_node_data(
threepart_formula = fourmulas_by_source[["O_"]],
node_df = model_cols(spflow_data[["orig"]]),
W = OW)
obs_O <- complete_nodeobs("O_")
spflow_matrices[["I_"]] <- transform_node_data(
threepart_formula = fourmulas_by_source[["I_"]],
node_df = model_cols(spflow_data[["orig"]]),
W = OW)
obs_I <- complete_nodeobs("I_")
## transform and lag od-pair attributes
incomplete_pairobs <- function(matrix_key) {
miss_X <- lapply(spflow_matrices[[matrix_key]], is.na)
miss_X <- Reduce("|", miss_X)
if (!any(miss_X))
return(NULL)
assert_NA(matrix_key)
return(miss_X)
}
remove_na_pairs <- function(na_pos, matrix_list) {
if (is.null(na_pos) | is.null(matrix_list))
return(matrix_list)
matrix_list <- lapply(matrix_list, function(mat) {
attr_inst <- attr_inst_status(mat)
mat <- na2zero(mat)
is_sparse <- (nnzero(mat) * 2) < length(mat)
mat <- as(mat, ifelse(is_sparse, "Matrix", "matrix"))
attr_inst_status(mat) <- attr_inst
mat
})
return(matrix_list)
}
spflow_matrices[["P_"]] <- transform_pair_data(
threepart_formula = fourmulas_by_source[["P_"]],
pair_df = subset_keycols(spflow_data[["pair"]]),
spflow_indicators = spflow_indicators,
OW = OW, DW = DW,
reduce_pair_instruments = isTRUE(spflow_control[["twosls_reduce_pair_instruments"]]))
na_G <- incomplete_pairobs("P_")
spflow_matrices[["P_"]] <- remove_na_pairs(na_G, spflow_matrices[["P_"]])
## transform and lag od-flows
spflow_matrices[["Y_"]] <- transform_flow_data(
threepart_formula = fourmulas_by_source[["Y_"]],
flow_df = subset_keycols(spflow_data[["pair"]]),
spflow_indicators = spflow_indicators,
OW = OW, DW = DW,
model = spflow_control[["model"]])
na_Y <- incomplete_pairobs("Y_")
actual_Y <- spflow_matrices[["Y_"]][[1]][do_indexes]
spflow_matrices[["Y_"]] <- remove_na_pairs(na_Y, spflow_matrices[["Y_"]])
### identify which information sets are available for each od pair
has_ZZ <- NULL # units with known signal (for predictions)
has_ZY <- NULL # units with known signal and flows-lags (for fitting)
if (!all(obs_D))
has_ZZ <- update_logicals(has_ZZ, obs_D[do_indexes[1]])
if (!all(obs_O))
has_ZZ <- update_logicals(has_ZZ, obs_O[do_indexes[2]])
if (!all(obs_I))
has_ZZ <- update_logicals(has_ZZ, (obs_I[do_indexes[2]] | spflow_indicators[[2]] != spflow_indicators[[1]]))
if (!is.null(na_G))
has_ZZ <- update_logicals(has_ZZ, !(na_G[do_indexes]))
if (!is.null(na_Y))
has_ZY <- update_logicals(has_ZZ, !(na_Y[do_indexes]))
weights <- derive_spflow_weights(
weight_funs = spflow_control[["weight_functions"]],
spflow_data = spflow_data,
do_indexes = do_indexes)
spflow_indicators <- list2df(
spflow_indicators,
HAS_ZZ = has_ZZ,
HAS_ZY = has_ZY,
WEIGHT = weights,
ACTUAL = actual_Y)
return(c(spflow_matrices, list(spflow_indicators = spflow_indicators)))
}
# ---- matrix generators ------------------------------------------------------
#' @importFrom Matrix Diagonal
#' @keywords internal
derive_spflow_constants <- function(
use_global_const,
use_intra_const,
use_instruments,
spflow_indicators = NULL,
OW = NULL,
DW = NULL) {
c_terms <- list("(Intercept)" = `attr_inst_status<-`(1, !use_global_const))
if (!use_intra_const)
return(c_terms)
n <- nlevels(spflow_indicators[[1]])
c_terms <- c(c_terms, "(Intra)" = `attr_inst_status<-`(Diagonal(n), FALSE))
if (!use_instruments)
return(c_terms)
# for computation of instruments the indicator is based on observed Y
Y_indicator <- spflow_indicators2mat(spflow_indicators, do_filter = "IN_SAMPLE")
intra_lags <- double_lag_matrix(
M = Diagonal(n),
OW = OW,
DW = DW,
name = "(Intra)",
key = "I",
M_indicator = Y_indicator,
symmetric_lags = is.null(Y_indicator),
lag_order = 2,
return_all_lags = TRUE,
lags_are_instruments = TRUE)
return(c(c_terms, intra_lags[-1]))
}
#' @keywords internal
transform_node_data <- function(
threepart_formula,
node_df,
W) {
if (is.null(threepart_formula) | is.null(node_df))
return(NULL)
# transformation according to the formula
combined_formula <- combine_rhs_formulas(threepart_formula)
node_df <- subset_keycols(node_df, drop_keys = TRUE)
node_mat <- flow_conform_model_matrix(combined_formula, node_df)
lostobs <- get_lostobs(node_df, node_mat)
node_mat <- impute_lost_cases(node_mat, lostobs, NA)
# spatial lags for sdm and instrument parts
var_use <- derive_variables_use(threepart_formula, node_df)
lag_num2var <- lookup(as.integer(var_use[["num_lags"]]), row.names(var_use))
node_mat <- add_lagged_cols(node_mat, W, lag_num2var)
inst_status2var <- unlist(var_use[["inst_attr"]])
inst_order2var <- c(which(!inst_status2var),which(inst_status2var))
node_mat <- node_mat[,inst_order2var,drop = FALSE]
attr_inst_status(node_mat) <- sort(inst_status2var)
return(node_mat)
}
#' @keywords internal
transform_pair_data <- function(
threepart_formula,
pair_df,
spflow_indicators,
OW,
DW,
reduce_pair_instruments) {
if (is.null(threepart_formula) | is.null(pair_df))
return(NULL)
# transformation according to the formula and concert to matrix-list format
combined_formula <- combine_rhs_formulas(threepart_formula)
pair_mat <- flow_conform_model_matrix(combined_formula, pair_df)
lostobs <- get_lostobs(pair_df, pair_mat)
pair_mat <- impute_lost_cases(pair_mat, lostobs, NA)
pair_mat <- spflow_indicators2matlist(cbind(spflow_indicators, pair_mat))
# spatial lags for instrument parts
var_use <- derive_variables_use(threepart_formula, pair_df)
lag_num2var <- lookup(as.integer(var_use[["num_lags"]]),row.names(var_use))
pair_mat <-
Reduce("c", lapply(lookup(names(pair_mat)), function(.var) {
double_lag_matrix(
M = pair_mat[[.var]],
DW = DW,
OW = OW,
name = .var,
key = "M",
M_indicator = spflow_indicators2mat(spflow_indicators),
lag_order = lag_num2var[.var],
return_all_lags = !reduce_pair_instruments,
lags_are_instruments = TRUE)}))
return(pair_mat)
}
#' @keywords internal
transform_flow_data <- function(
threepart_formula,
flow_df,
spflow_indicators,
OW,
DW,
model) {
if (is.null(threepart_formula) | is.null(flow_df))
return(NULL)
combined_formula <- combine_rhs_formulas(threepart_formula)
flow_mat <- flow_conform_model_matrix(combined_formula, flow_df)
lostobs <- get_lostobs(flow_df, flow_mat)
flow_mat <- impute_lost_cases(flow_mat, lostobs, NA)
flow_mat <- spflow_indicators2matlist(cbind(spflow_indicators, flow_mat))
flow_mat <- Reduce("c", Map(function(.var, .mat) {
lag_flow_matrix(
Y = .mat,
model = model,
DW = DW,
OW = OW,
name = .var,
M_indicator = spflow_indicators2mat(spflow_indicators))},
.var = names(flow_mat),
.mat = flow_mat))
return(flow_mat)
}
# ---- helpers ----------------------------------------------------------------
#' @keywords internal
pull_spflow_data <- function(.multinet, pair_id = id(.multinet)[["pairs"]][1]) {
source_ids <- as.list(id(.multinet@pairs[[pair_id]]))
flow_data <- lapply(source_ids, function(.id){
source_data <- dat(.multinet, .id)
row.names(source_data) <- NULL
source_data
})
return(flow_data)
}
#' @keywords internal
get_keycols <- function(df, no_coords = FALSE) {
keys <- c(attr_key_do(df), attr_key_nodes(df))
if (no_coords)
return(keys)
return(c(keys, attr_coord_col(df)))
}
#' @keywords internal
subset_keycols <- function(df, drop_keys = TRUE) {
keep_cols <- get_keycols(df)
if (drop_keys) keep_cols <- setdiff(names(df), keep_cols)
return(df[, keep_cols, drop = FALSE])
}
#' @keywords internal
flow_conform_model_matrix <- function(formula,data) {
terms_obj <- terms(formula, data = data)
attr(terms_obj,"intercept") <- formula_expands_factors(formula,data) * 1
mat <- stats::model.matrix.lm(terms_obj,data,na.action = "na.pass")
mat[,colnames(mat) != "(Intercept)", drop = FALSE]
}
#' @keywords internal
get_lostobs <- function(pre, trans) {
if (is.null(pre) || is.null(trans) || nrow(trans) == nrow(pre))
return(NULL)
lost <- rep(TRUE, nrow(pre))
lost[as.integer(row.names(trans))] <- FALSE
return(lost)
}
#' @keywords internal
impute_lost_cases <- function(mat, lost_cases, imp = 0) {
if (is.null(lost_cases))
return(mat)
assert(is.logical(lost_cases) & length(lost_cases >= nrow(mat)))
if (sum(lost_cases) == nrow(mat))
return(mat)
xmat <- matrix(imp, nrow = length(lost_cases), ncol = ncol(mat), dimnames = list(NULL, colnames(mat)))
xmat[!lost_cases, ] <- mat
return(xmat)
}
#' @keywords internal
derive_variables_use <- function(
formula_part,
data_source) {
var_use <- lapply(formula_part, "predict_tranfomed_vars", data_source)
var_use_types <- named_list(c("norm","sdm","inst"))
var_use_types[names(var_use)] <- var_use
all_vars <- unique(unlist(var_use, use.names = FALSE))
get_v_use <- function(.v) data.frame(lapply(var_use_types, function(.u) any(.u == .v)))
var_use_df <- do.call("rbind", lapply(lookup(all_vars), get_v_use))
var_use_df[["num_lags"]] <- var_use_df[["sdm"]] + 2 * var_use_df[["inst"]]
inst_attr <- Map(
function(norm, sdm, num_lags){
inst_statu <- rep(TRUE, num_lags + 1)
inst_statu[1] <- !norm
if(num_lags >= 1)
inst_statu[2] <- !sdm
inst_statu
},
var_use_df$norm, var_use_df$sdm, var_use_df$num_lags)
var_use_df[["inst_attr"]] <- I(inst_attr)
var_use_df
}
#' @keywords internal
list2df <- function(...) {
data.frame(Filter(function(x) length(x) != 0, list(...)))
}
#' @keywords internal
na2zero <- function(x) {
if (is.atomic(x))
x[is.na(x)] <- 0
if (inherits(x,"Matrix"))
x@x[is.na(x@x)] <- 0
return(x)
}
#' @keywords internal
derive_spflow_weights <- function(weight_funs, spflow_data, do_indexes) {
if (is.null(weight_funs))
return(NULL)
eval_wfun <- function(wkey) {
wfun <- weight_funs[[wkey]]
wdat <- spflow_data[[wkey]]
if (is.null(wfun) | is.null(wdat))
return(NULL)
wt <- try(wfun(wdat), silent = TRUE)
assert(is_one_of(wt, c("logical","numeric")),'
Derivation of weights failed!
Please ensure the weight functions for %s
provided in spflow_control return a logical or a numeric.', wkey)
if (wkey == "orig")
wt <- wt[do_indexes[,2]]
if (wkey == "dest")
wt <- wt[do_indexes[,1]]
return(wt)
}
wt_parts <- named_list(c("pair","orig","dest"))
wt_parts[["pair"]] <- eval_wfun("pair")
wt_parts[["orig"]] <- eval_wfun("orig")
wt_parts[["dest"]] <- eval_wfun("dest")
wt_is_logical <- all(sapply(wt_parts, is.logical))
wt <- Reduce(ifelse(wt_is_logical,"&","*"), wt_parts)
wt[wt<0] <- NA
wt[!is.finite(wt)] <- ifelse(wt_is_logical,FALSE,0)
return(wt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.