R/sar.R
In CDatanet: Econometrics of Network Data

Documented in sar simsar

#' @title Simulating Data from Linear-in-Mean Models with Social Interactions
#' @param formula A symbolic description of the model, passed as a class object of type \link[stats]{formula}. 
#'        The formula must specify the endogenous variable and control variables, for example: 
#'        \code{y ~ x1 + x2 + gx1 + gx2}, where `y` is the endogenous vector, 
#'        and `x1`, `x2`, `gx1`, and `gx2` are the control variables, which may include contextual variables (peer averages). 
#'        Peer averages can be computed using the function \code{\link{peer.avg}}.
#' @param Glist A list of network adjacency matrices representing multiple subnets. The \code{m}-th element in the list should be an 
#'        \code{ns * ns} matrix, where \code{ns} is the number of nodes in the \code{m}-th subnet.
#' @param theta A numeric vector defining the true values of the model parameters \eqn{\theta = (\lambda, \Gamma, \sigma)}. 
#'        These parameters are used to define the model specification in the details section.
#' @param cinfo A Boolean flag indicating whether the information is complete (`cinfo = TRUE`) or incomplete (`cinfo = FALSE`). 
#'        If information is incomplete, the model operates under rational expectations.
#' @param data An optional data frame, list, or environment (or an object coercible by \link[base]{as.data.frame} to a data frame) 
#'        containing the variables in the model. If not provided, the variables are taken from the environment of the function call.
#' @description
#' `simsar` simulates continuous variables under linear-in-mean models with social interactions, following the specifications described 
#' in Lee (2004) and Lee et al. (2010). The model incorporates peer interactions, where the value of an individual’s outcome depends 
#' not only on their own characteristics but also on the average characteristics of their peers in the network.
#' @references  
#' Lee, L. F. (2004). Asymptotic distributions of quasi-maximum likelihood estimators for spatial autoregressive models. \emph{Econometrica}, 72(6), 1899-1925, \doi{10.1111/j.1468-0262.2004.00558.x}.
#' @references  
#' Lee, L. F., Liu, X., & Lin, X. (2010). Specification and estimation of social interaction models with network structures. The Econometrics Journal, 13(2), 145-176, \doi{10.1111/j.1368-423X.2010.00310.x}
#' @details 
#' In the complete information model, the outcome \eqn{y_i} for individual \eqn{i} is defined as:
#' \deqn{y_i = \lambda \bar{y}_i + \mathbf{z}_i'\Gamma + \epsilon_i,}
#' where \eqn{\bar{y}_i} represents the average outcome \eqn{y} among individual \eqn{i}'s peers, 
#' \eqn{\mathbf{z}_i} is a vector of control variables, and \eqn{\epsilon_i \sim N(0, \sigma^2)} is the error term. 
#' In the case of incomplete information models with rational expectations, the outcome \eqn{y_i} is defined as:
#' \deqn{y_i = \lambda E(\bar{y}_i) + \mathbf{z}_i'\Gamma + \epsilon_i,}
#' where \eqn{E(\bar{y}_i)} is the expected average outcome of \eqn{i}'s peers, as perceived by individual \eqn{i}.
#' 
#' @return A list containing the following elements:
#'     \item{y}{the observed count data.}
#'     \item{Gy}{the average of y among friends.}
#' @seealso \code{\link{sar}}, \code{\link{simsart}}, \code{\link{simcdnet}}.
#' @examples 
#' \donttest{
#' # Groups' size
#' set.seed(123)
#' M      <- 5 # Number of sub-groups
#' nvec   <- round(runif(M, 100, 1000))
#' n      <- sum(nvec)
#' 
#' # Parameters
#' lambda <- 0.4
#' Gamma  <- c(2, -1.9, 0.8, 1.5, -1.2)
#' sigma  <- 1.5
#' theta  <- c(lambda, Gamma, sigma)
#' 
#' # X
#' X      <- cbind(rnorm(n, 1, 1), rexp(n, 0.4))
#' 
#' # Network
#' G      <- list()
#' 
#' for (m in 1:M) {
#'   nm           <- nvec[m]
#'   Gm           <- matrix(0, nm, nm)
#'   max_d        <- 30
#'   for (i in 1:nm) {
#'     tmp        <- sample((1:nm)[-i], sample(0:max_d, 1))
#'     Gm[i, tmp] <- 1
#'   }
#'   rs           <- rowSums(Gm); rs[rs == 0] <- 1
#'   Gm           <- Gm/rs
#'   G[[m]]       <- Gm
#' }
#' 
#' # data
#' data   <- data.frame(X, peer.avg(G, cbind(x1 = X[,1], x2 =  X[,2])))
#' colnames(data) <- c("x1", "x2", "gx1", "gx2")
#' 
#' ytmp    <- simsar(formula = ~ x1 + x2 + gx1 + gx2, Glist = G, 
#'                   theta = theta, data = data) 
#' y       <- ytmp$y
#' }
#' @importFrom Rcpp sourceCpp
#' @export
simsar   <- function(formula,
                     Glist,
                     theta,
                     cinfo = TRUE,
                     data) {
  stopifnot(length(cinfo) == 1)
  stopifnot(cinfo %in% c(TRUE, FALSE))
  if (!is.list(Glist)) {
    Glist  <- list(Glist)
  }
  
  M        <- length(Glist)
  nvec     <- unlist(lapply(Glist, nrow))
  n        <- sum(nvec)
  igr      <- matrix(c(cumsum(c(0, nvec[-M])), cumsum(nvec) - 1), ncol = 2)
  
  
  f.t.data <- formula.to.data(formula = formula, contextual = FALSE, Glist = Glist, M = M, igr = igr, 
                              data = data, type = "sim", theta0  = 0)
  X        <- f.t.data$X
  # print(colnames(X))
  K        <- length(theta)
  if(K != (ncol(X) + 2)) {
    stop("Length of theta is not suited.")
  }
  lambda   <- theta[1]
  b        <- theta[2:(K - 1)]
  sigma    <- theta[K ]
  
  
  
  xb       <- c(X %*% b)
  eps      <- rnorm(n, 0, sigma)
  
  out      <- NULL
  if(cinfo){
    y      <- rep(0, n)
    Gy     <- numeric(n)
    fySar(y, Gy, Glist, eps, igr, M, xb, lambda)
    out    <- list("y"  = y,
                   "Gy" = Gy)
  } else {
    y      <- rep(0, n)
    ye     <- rep(0, n)
    Gye    <- numeric(n)
    fySarRE(y, Gye, ye, Glist, eps, igr, M, xb, lambda)
    out    <- list("y"  = y,
                   "Gy" = Gy)
  }
  out
}


#' @title Estimating Linear-in-mean Models with Social Interactions
#' @param formula a class object \link[stats]{formula}: a symbolic description of the model. `formula` must be as, for example, \code{y ~ x1 + x2 + gx1 + gx2}
#' where `y` is the endogenous vector and `x1`, `x2`, `gx1` and `gx2` are control variables, which can include contextual variables, i.e. averages among the peers.
#' Peer averages can be computed using the function \code{\link{peer.avg}}.
#' @param Glist The network matrix. For networks consisting of multiple subnets, `Glist` can be a list of subnets with the `m`-th element being an `ns*ns` adjacency matrix, where `ns` is the number of nodes in the `m`-th subnet.
#' @param lambda0 an optional starting value of \eqn{\lambda}.
#' @param fixed.effects a Boolean indicating whether group heterogeneity must be included as fixed effects.
#' @param optimizer is either `nlm` (referring to the function \link[stats]{nlm}) or `optim` (referring to the function \link[stats]{optim}). 
#' Arguments for these functions such as, `control` and `method` can be set via the argument `opt.ctr`.
#' @param opt.ctr list of arguments of \link[stats]{nlm} or \link[stats]{optim} (the one set in `optimizer`) such as `control`, `method`, etc.
#' @param print a Boolean indicating if the estimate should be printed at each step.
#' @param cov a Boolean indicating if the covariance should be computed.
#' @param cinfo a Boolean indicating whether information is complete (`cinfo = TRUE`) or incomplete (`cinfo = FALSE`). In the case of incomplete information, the model is defined under rational expectations. 
#' @param data an optional data frame, list or environment (or object coercible by \link[base]{as.data.frame} to a data frame) containing the variables
#' in the model. If not found in data, the variables are taken from \code{environment(formula)}, typically the environment from which `sar` is called.
#' @description
#' `sar` computes quasi-maximum likelihood estimators for linear-in-mean models with social interactions (see Lee, 2004 and Lee et al., 2010).
#' @details 
#' In the complete information model, the outcome \eqn{y_i} for individual \eqn{i} is defined as:
#' \deqn{y_i = \lambda \bar{y}_i + \mathbf{z}_i'\Gamma + \epsilon_i,}
#' where \eqn{\bar{y}_i} represents the average outcome \eqn{y} among individual \eqn{i}'s peers, 
#' \eqn{\mathbf{z}_i} is a vector of control variables, and \eqn{\epsilon_i \sim N(0, \sigma^2)} is the error term. 
#' In the case of incomplete information models with rational expectations, the outcome \eqn{y_i} is defined as:
#' \deqn{y_i = \lambda E(\bar{y}_i) + \mathbf{z}_i'\Gamma + \epsilon_i,}
#' where \eqn{E(\bar{y}_i)} is the expected average outcome of \eqn{i}'s peers, as perceived by individual \eqn{i}.
#' @references  
#' Lee, L. F. (2004). Asymptotic distributions of quasi-maximum likelihood estimators for spatial autoregressive models. \emph{Econometrica}, 72(6), 1899-1925, \doi{10.1111/j.1468-0262.2004.00558.x}.
#' @references  
#' Lee, L. F., Liu, X., & Lin, X. (2010). Specification and estimation of social interaction models with network structures. The Econometrics Journal, 13(2), 145-176, \doi{10.1111/j.1368-423X.2010.00310.x}
#' @seealso \code{\link{sart}}, \code{\link{cdnet}}, \code{\link{simsar}}.
#' @examples 
#' \donttest{
#' # Groups' size
#' set.seed(123)
#' M      <- 5 # Number of sub-groups
#' nvec   <- round(runif(M, 100, 1000))
#' n      <- sum(nvec)
#' 
#' # Parameters
#' lambda <- 0.4
#' Gamma  <- c(2, -1.9, 0.8, 1.5, -1.2)
#' sigma  <- 1.5
#' theta  <- c(lambda, Gamma, sigma)
#' 
#' # X
#' X      <- cbind(rnorm(n, 1, 1), rexp(n, 0.4))
#' 
#' # Network
#' G      <- list()
#' 
#' for (m in 1:M) {
#'   nm           <- nvec[m]
#'   Gm           <- matrix(0, nm, nm)
#'   max_d        <- 30
#'   for (i in 1:nm) {
#'     tmp        <- sample((1:nm)[-i], sample(0:max_d, 1))
#'     Gm[i, tmp] <- 1
#'   }
#'   rs           <- rowSums(Gm); rs[rs == 0] <- 1
#'   Gm           <- Gm/rs
#'   G[[m]]       <- Gm
#' }
#' 
#' # data
#' data   <- data.frame(X, peer.avg(G, cbind(x1 = X[,1], x2 =  X[,2])))
#' colnames(data) <- c("x1", "x2", "gx1", "gx2")
#' 
#' ytmp    <- simsar(formula = ~ x1 + x2 + gx1 + gx2, Glist = G, 
#'                   theta = theta, data = data) 
#' data$y  <- ytmp$y
#' 
#' out     <- sar(formula = y ~ x1 + x2 + gx1 + gx2, Glist = G, 
#'                optimizer = "optim", data = data)
#' summary(out)
#' }
#' @return A list consisting of:
#'     \item{info}{list of general information on the model.}
#'     \item{estimate}{Maximum Likelihood (ML) estimator.}
#'     \item{cov}{covariance matrix of the estimate.}
#'     \item{details}{outputs as returned by the optimizer.}
#' @export
sar <- function(formula,
                Glist, 
                lambda0       = NULL, 
                fixed.effects = FALSE,
                optimizer     = "optim",
                opt.ctr       = list(), 
                print         = TRUE, 
                cov           = TRUE,
                cinfo         = TRUE,
                data) {
  stopifnot(length(cinfo) == 1)
  stopifnot(cinfo %in% c(TRUE, FALSE))
  if(!cinfo) stop("Incomplete information is not supported in this version.")
  stopifnot(optimizer %in% c("optim", "nlm"))
  env.formula <- environment(formula)
  #size 
  if (!is.list(Glist)) {
    Glist    <- list(Glist)
  }
  M          <- length(Glist)
  nvec       <- unlist(lapply(Glist, nrow))
  n          <- sum(nvec)
  igr        <- matrix(c(cumsum(c(0, nvec[-M])), cumsum(nvec) - 1), ncol = 2)
  
  f.t.data   <- formula.to.data(formula, contextual = FALSE, Glist, M, igr, data, 
                                theta0 = NULL, fixed.effects = fixed.effects)
  formula    <- f.t.data$formula
  y          <- f.t.data$y
  Gy         <- f.t.data$Gy
  X          <- f.t.data$X
  coln       <- c("lambda", colnames(X), "sigma")
  
  K          <- ncol(X)
  
  # variables
  Nvec       <- sapply(Glist, nrow)
  
  XX         <- t(X)%*%X
  invXX      <- solve(XX)
  
  Ilist      <- lapply(1:M, function(w) diag(Nvec[w]))
  
  lambdat    <- NULL
  if (!is.null(lambda0)) {
    lambdat  <- log(lambda0/(1- lambda0))
  } else {
    Xtmp     <- cbind(f.t.data$Gy, X)
    b        <- solve(t(Xtmp)%*%Xtmp, t(Xtmp)%*%y)
    lambdat  <- log(max(b[1]/(1 - b[1]), 0.01))
  }
  
  
  # arguments
  if ((length(opt.ctr) == 0) & optimizer == "optim") {
    opt.ctr  <- list("method" = "Brent",
                     "upper"  = 37,
                     "lower"  = -710)
  }
  ctr        <- c(list(X = X,invXX = invXX, G = Glist, I = Ilist, n = n, y = y, Gy = Gy, 
                       ngroup = M, FE = fixed.effects, print = print), opt.ctr)
  
  if (optimizer == "optim") {
    ctr      <- c(ctr, list(par = lambdat, fn = foptimSAR))
    par0     <- "par"
    par1     <- "par"
    like     <- "value"
  } else {
    ctr     <- c(ctr, list(p = lambdat, f = foptimSAR))
    par0    <- "p"
    par1    <- "estimate"
    like    <- "minimum"
  }
  
  resSAR    <- do.call(get(optimizer), ctr)
  lambdat   <- resSAR[[par1]]
  llh       <- -resSAR[[like]]
  
  lambda    <- 1/(1 + exp(-lambdat))
  
  hessian   <- jacobSAR(lambda, X, invXX, XX, y, n, Glist, Ilist, Gy, M, igr, cov, fixed.effects)
  
  
  beta      <- hessian$beta
  sigma2    <- hessian$sigma2
  
  covout    <- NULL
  if(cov) {
    covout           <- hessian$cov
    colnames(covout) <- coln
    rownames(covout) <- coln
  }
  
  theta             <- c(lambda, beta, sqrt(sigma2))
  names(theta)      <- coln
  
  environment(formula) <- env.formula
  sdata                <- list(
    "formula"       = formula,
    "Glist"         = deparse(substitute(Glist)),
    "nfriends"      = unlist(lapply(Glist, function(u) sum(u > 0))) 
  )
  if (!missing(data)) {
    sdata              <- c(sdata, list("data" = deparse(substitute(data))))
  }  
  
  
  INFO                 <- list("M"             = M,
                               "n"             = n,
                               "fixed.effects" = fixed.effects,
                               "nlinks"        = unlist(lapply(Glist, function(u) sum(u > 0))),
                               "formula"       = formula,
                               "log.like"      = llh)
  
  out                  <- list("info"       = INFO,
                               "estimate"   = theta, 
                               "cov"        = covout,
                               "details"    = resSAR)
  class(out)           <- "sar"
  out
}

jacobSAR <- function(alpha, X, invXX, XX, y, N, G, I, Gy, ngroup, igroup, 
                     cov = TRUE, fixed.effects = FALSE) {
  Ay           <- (y - alpha * Gy)
  beta         <- invXX %*% (t(X) %*% Ay)
  Xbeta        <- X %*% beta
  s2           <- sum((Ay - Xbeta) ^ 2) / ifelse(fixed.effects, N - ngroup, N)
  nbeta        <- length(beta)
  
  covout       <- NULL
  if (cov) {
    covout   <- fSARjac(alpha, s2, X, XX, Xbeta, G, I, igroup, ngroup, N, 
                        nbeta, fixed.effects)
  }
  
  
  list("alpha"   = alpha,
       "beta"    = beta,
       "sigma2"  = s2,
       "cov"     = covout)
}


#' @title Summary for the Estimation of Linear-in-mean Models with Social Interactions
#' @description Summary and print methods for the class `sar` as returned by the function \code{\link{sar}}.
#' @param object an object of class `sar`, output of the function \code{\link{sar}}.
#' @param x an object of class `summary.sar`, output of the function \code{\link{summary.sar}} or 
#' class `sar`, output of the function \code{\link{sar}}.
#' @param ... further arguments passed to or from other methods.
#' @return A list of the same objects in `object`.
#' @export
"summary.sar" <- function(object,
                          ...) {
  stopifnot(class(object) == "sar")
  out           <- c(object, list("..."       = ...)) 
  if(is.null(object$cov)){
    stop("Covariance was not computed")
  }
  class(out)    <- "summary.sar"
  out
}


#' @rdname summary.sar
#' @export
"print.summary.sar"  <- function(x, ...) {
  stopifnot(class(x) == "summary.sar")
  
  M                    <- x$info$M
  n                    <- x$info$n
  estimate             <- x$estimate
  formula              <- x$info$formula
  K                    <- length(estimate)
  coef                 <- estimate[-K]
  std                  <- sqrt(diag(x$cov[-K, -K, drop = FALSE]))
  sigma                <- estimate[K]
  llh                  <- x$info$log.like
  
  
  tmp                  <- fcoefficients(coef, std)
  out_print            <- tmp$out_print
  out                  <- tmp$out
  out_print            <- c(list(out_print), x[-(1:4)], list(...))
  
  
  nfr                  <- x$info$nlinks
  cat("SAR Model\n\n")
  cat("Call:\n")
  cat(paste0(formula, ", fixed.effects = ", x$info$fixed.effects), "\n")
  # print(formula)
  cat("\nMethod: Quasi-Maximum Likelihood (ML)", "\n\n")
  
  cat("Network:\n")
  cat("Number of groups         : ", M, "\n")
  cat("Sample size              : ", n, "\n")
  cat("Average number of friends: ", sum(nfr)/n, "\n\n")
  
  cat("Coefficients:\n")
  do.call("print", out_print)
  cat("---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\n")
  cat("sigma: ", sigma, "\n")
  cat("log likelihood: ", llh, "\n")
  
  invisible(x)
}

#' @rdname summary.sar
#' @export
"print.sar" <- function(x, ...) {
  stopifnot(class(x) == "sar")
  print(summary(x, ...))
}