R/simulate_BLP_dataset.R
In BLPestimatoR: Performs a BLP Demand Estimation

Documented in simulate_BLP_dataset

#' @useDynLib BLPestimatoR
#' @importFrom Rcpp sourceCpp
NULL

#'  This function creates a simulated BLP dataset.
#' @useDynLib BLPestimatoR
#' @param nmkt number of markets
#' @param nbrn number of products
#' @param Xlin character vector specifying the set of linear variables
#' @param Xexo character vector specifying the set of exogenous variables (subset of \code{Xlin})
#' @param Xrandom character vector specifying the set of random coefficients (subset of \code{Xlin})
#' @param instruments character vector specifying the set of instrumental variables
#'
#' @param true.parameters list with parameters of the DGP
#' \describe{
#' \item{\code{Xlin.true.except.price}}{"true" linear coefficients in utility function except price}
#' \item{\code{Xlin.true.price}}{"true" linear price coefficient in utility function}
#' \item{\code{Xrandom.true}}{"true" set of random coefficients}
#' \item{\code{instrument.effects}}{"true" coefficients of instrumental variables to explain endogenous price}
#' \item{\code{instrument.Xexo.effects}}{"true" coefficients of exogenous variables to explain endogenous price} }
#'
#' @param price.endogeneity list with arguments of the multivariate normal
#' distribution
#' \describe{
#' \item{\code{mean.xi}}{controls for the mean of the error term in the utility function}
#' \item{\code{mean.eita}}{controls for the mean of the error term in the price function}
#' \item{\code{cov}}{controls for the covariance of \code{xi} and \code{eita} }}
#'
#' @param printlevel 0 (no output) 1 (summary of generated data)
#' @param seed seed for the random number generator
#'
#' @return Returns a simulated BLP dataset.
#'
#' @details The dataset is balanced, so every market has the same amount of products.
#'  Only unobserved heterogeneity can be considered.
#'  Variables that enter the equation as a Random Coefficient or
#'  exogenously must be included in the set of linear variables.
#'  The \code{parameter.list} argument specifies the "true" effect on the
#'  individual utility for each component. Prices are generated endogenous
#'  as a function of exogenous variables and instruments, where the
#'  respective effect sizes are specified in \code{instrument.effects}
#'  and \code{instrument.Xexo.effects}. Error terms \code{xi} and \code{eita}
#'  are drawn from a multivariate normal distribution, whose
#'  parameters can be set in \code{price.endogeneity}. Market shares
#'  are generated by MLHS integration rule with 10000 nodes.
#'
#'
#' @importFrom stats dnorm
#' @importFrom stats pnorm
#' @importFrom stats qnorm
#' @importFrom stats rnorm
#' @importFrom stats runif
#' @importFrom stats pchisq
#' @importFrom stats na.omit
#' @importFrom stats optim
#' @importFrom stats model.frame
#' @importFrom stats model.matrix
#' @importFrom stats model.response
#' @importFrom stats na.fail
#' @importFrom stats optim
#' @importFrom Formula as.Formula
#' @importFrom mvQuad createNIGrid
#' @importFrom mvQuad rescale
#' @importFrom mvQuad getWeights
#' @importFrom mvQuad getNodes
#' @importFrom numDeriv hessian
#' @importFrom randtoolbox halton
#'
#' @examples
#' K<-2 #number of random coefficients
# example_data <- simulate_BLP_dataset(nmkt = 25,
#                                 nbrn = 20,
#                                 Xlin = c("price", "x1", "x2", "x3", "x4", "x5"),
#                                 Xexo = c("x1", "x2", "x3", "x4", "x5"),
#                                 Xrandom = paste0("x",1:K),
#                                 instruments = paste0("iv",1:10),
#                                 true.parameters = list(Xlin.true.except.price = rep(0.2,5),
#                                                        Xlin.true.price = -0.2,
#                                                        Xrandom.true = rep(2,K),
#                                                        instrument.effects = rep(2,10),
#                                                        instrument.Xexo.effects = rep(1,5)),
#                                 price.endogeneity = list( mean.xi = -2,
#                                                           mean.eita = 0,
#                                                           cov = cbind( c(1,0.7), c(0.7,1))),
#                                 printlevel = 0, seed = 234234 )
#'
#' @export
simulate_BLP_dataset <- function( nmkt, nbrn,
                             Xlin, Xexo, Xrandom, instruments,
                             true.parameters = list(),
                             price.endogeneity = list( mean.xi = -2,
                                                       mean.eita = 0,
                                                       cov = cbind( c(1,0.7), c(0.7,1))),
                             printlevel=1 , seed)
    {


    ## input checks ----
    ### Existence check of necessary arguments
        # (collecting all arguments as a list, implicitly requires the arguments to be non-empty,
        # and lapply checks whether list arguments are non empty, i.e. are null)
        if( missing(nmkt) || missing(nbrn) || missing(Xlin) ||
            missing(Xexo) || missing(Xrandom) || missing(instruments)){
          stop("Include arguments nmkt, nbrn, Xlin, Xexo, Xrandom and instruments")
        }


        toBeTested<- list("true.parameters$Xlin.true.except.price" = true.parameters$Xlin.true.except.price,
                          "true.parameters$Xlin.true.price" = true.parameters$Xlin.true.price,
                          "true.parameters$Xrandom.true" = true.parameters$Xrandom.true,
                          "true.parameters$instrument.effects" = true.parameters$instrument.effects,
                          "true.parameters$instrument.Xexo.effects" = true.parameters$instrument.Xexo.effects,
                          "price.endogeneity$mean.xi" = price.endogeneity$mean.xi,
                          "price.endogeneity$mean.eita" = price.endogeneity$mean.eita,
                          "price.endogeneity$cov" = price.endogeneity$cov)

        names_toBeTested <- names(toBeTested)

        lapply(names_toBeTested,
               function(i){
                 if(is.null(toBeTested[[i]])) {
                   stop(paste("Argument",i,"is missing."))
                 } }
        )
    ### Content check
      if (is.numeric(seed) & (length(seed)>0) )
      {
          set.seed(seed)
      } else
      {
          seed <- NA
      }
      if (!("price" %in% Xlin))
          stop("Linear parameters must include a variable named *price* . ")
      if (!all(Xexo %in% Xlin) || !all(Xrandom %in% Xlin))
          stop("Linear parameters must include random coefficients and exogenous variables.")
      if (length(Xlin) != (length(true.parameters$Xlin.true.except.price) +
          1))
          stop("Number of linear parameters and true effects must match.")
      if (length(Xrandom) != length(true.parameters$Xrandom.true))
          stop("Number of random coefficients and true effects must match.")
      if (length(instruments) != length(true.parameters$instrument.effects))
          stop("Number of instruments and true effects must match.")

    ## intializing vectors ----
      nobs <- nmkt * nbrn
      nlin <- length(Xlin)  # number of lin. parameters
      ninst <- length(instruments)  # number of instruments
      K <- length(true.parameters$Xrandom.true)
      cdid <- sort(rep(1:nmkt, nbrn))
      prod_id <- rep(1:nbrn, nmkt)
      cdindex <- c(0, cumsum(table(cdid)))

      total.demographics <- 0



    ## eita & xi ----
      cov.xi.eita <- price.endogeneity$cov

      random.rv <- cbind(rnorm(nobs), rnorm(nobs))
      choleski <- t(chol(cov.xi.eita))
      xi.eita <- t(apply(  random.rv , 1 ,function(x) choleski %*% x) )
      xi <- xi.eita[, 1] + price.endogeneity$mean.xi
      eita <- xi.eita[, 2] + price.endogeneity$mean.eita


    ## generate data matrices ----
      instruments.data <- vapply(instruments, function(x) runif(nobs, 0, 2), numeric(nobs))  #;% cost shifters
      Xlin.data <- vapply(Xlin, function(x) runif(nobs, 0, 2), numeric(nobs))
      Xlin.data[, "price"] <- NA  # %; price is generated as a dependent variable of instruments and Xexo
      Xexo.data <- vapply(Xexo, function(x) Xlin.data[, x], numeric(nobs))
      Xlin.data[, "price"] <- instruments.data %*% true.parameters$instrument.effects +
          Xexo.data %*% true.parameters$instrument.Xexo.effects + eita
      Xrandom.data <- vapply(Xrandom, function(x) Xlin.data[, x], numeric(nobs))


    # true marketshares ----
      integration.list <- get_integration_input(dim = K, method = "MLHS",
                                                accuracy = 10000, nmkt = nmkt, seed = seed)

      draws_mktShape <- integration.list$nodesMktShape

      deltatrue <- c(Xlin.data[, "price"] * true.parameters$Xlin.true.price +
                       Xlin.data[, -which(Xlin == "price"), drop = FALSE] %*% matrix(true.parameters$Xlin.true.except.price) +
                        xi)

      deltatrue.exp <- exp(deltatrue)
      theta2.matrix <- matrix(true.parameters$Xrandom.true)  # ;% could also include demographic effects in subsequent columns
      expmu <- getExpMu(theta2.matrix,
                        draws_mktShape,
                        Xrandom.data,
                        cdid,
                        demographics = matrix(NA))

      sij <- getSij(expmu,
                    deltatrue.exp,
                    cdindex)

      shares <- c(sij %*% matrix(integration.list$weights))

    ## message and output ----
      if (printlevel > 1)
      {
          cat("You are now working with:", "\n", K, "random coefficient(s) with true values: ",
              true.parameters$Xrandom.true, ",\n", total.demographics,
              "demographics, \n", nmkt, "markets,", "\n", nbrn,
              "products and", "\n", ninst, "instruments.", "\n",
              "The following variables enter the problem linearly: ",
              Xlin, "\n", "Exogenous (i.e. no correlation to the structural error term) variables are:",
              Xexo, "\n", "Variables that are used as random coefficient: ",
              Xrandom, "\n")
      }


      BLP.data <- cbind(Xlin.data, instruments.data, cdid, shares,
          deltatrue,prod_id)
      BLP.dataframe <- data.frame(BLP.data)
      return(BLP.dataframe)

}