R/point_estimation.R
In povmap: Extension to the 'emdi' Package

# Internal documentation -------------------------------------------------------

# Point estimation function

# This function implements the transformation of data, estimation of the nested
# error linear regression model and the monte-carlo approximation to predict
# the desired indicators. If the weighted version of the approach is used, then
# additional estimation steps are taken in order to calculate weighted
# regression coefficients before the monte-carlo approximation. See
# corresponding functions below.


point_estim <- function(framework,
                        fixed,
                        transformation,
                        interval,
                        L,
                        keep_data = FALSE,
                        Ydump = NULL) {

  # Transformation of data -----------------------------------------------------

  # Estimating the optimal parameter by optimization
  # Optimal parameter function returns the minimum of the optimization
  # functions from generic_opt; the minimum is the optimal lambda.
  # The function can be found in the script optimal_parameter.R
  optimal_lambda <- optimal_parameter(
    generic_opt = generic_opt,
    fixed = fixed,
    smp_data = framework$smp_data,
    smp_domains = framework$smp_domains,
    transformation = transformation,
    interval = interval,
    framework = framework
  )

  # Data_transformation function returns transformed data and shift parameter.
  # The function can be found in the script transformation_functions.R
  transformation_par <- data_transformation(
    fixed = fixed,
    smp_data = framework$smp_data,
    transformation = transformation,
    lambda = optimal_lambda
  )
  shift_par <- transformation_par$shift

  # Model estimation, model parameter and parameter of generating model --------

  # Estimation of the nested error linear regression model
  # See Molina and Rao (2010) p. 374
  # lme function is included in the nlme package which is imported.

  if(!is.null(framework$weights) &&
     any(framework$weights_type %in% c("nlme", "nlme_lambda"))) {

    transformation_par$transformed_data$weights_scaled <-
      framework$smp_data[,framework$weights] /
        mean(framework$smp_data[,framework$weights], na.rm = TRUE)

    mixed_model <- nlme::lme(
      fixed = fixed,
      data = transformation_par$transformed_data,
      random =
        as.formula(paste0("~ 1 | as.factor(",framework$smp_domains, ")")),
      method = framework$nlme_method,
      control = nlme::lmeControl(maxIter = framework$nlme_maxiter,
                                 tolerance = framework$nlme_tolerance,
                                 opt = framework$nlme_opt,
                                 optimMethod = framework$nlme_optimmethod, 
                                 msMaxIter=framework$nlme_msmaxiter,
                                 msTol=framework$nlme_mstol,
                                 returnObject = framework$nlme_returnobject 
                                 ),
      keep.data = keep_data,
      weights =
        varComb(
          varIdent(as.formula(
            paste0("~ 1 | as.factor(", framework$smp_domains, ")")
          )),
          varFixed(as.formula(paste0("~1/", "weights_scaled")))
        )
    )
  } else {
    mixed_model <- nlme::lme(
      fixed = fixed,
      data = transformation_par$transformed_data,
      random =
        as.formula(paste0("~ 1 | as.factor(",framework$smp_domains, ")")),
      method = framework$nlme_method,
      control = nlme::lmeControl(maxIter = framework$nlme_maxiter,
                                 tolerance = framework$nlme_tolerance,
                                 opt = framework$nlme_opt,
                                 optimMethod = framework$nlme_optimmethod, 
                                 msMaxIter=framework$nlme_msmaxiter,
                                 msTol=framework$nlme_mstol,
                                 returnObject = framework$nlme_returnobject 
                                 ),
      keep.data = keep_data
    )
  }



  # Function model_par extracts the needed parameters theta from the nested
  # error linear regression model. It returns the beta coefficients (betas),
  # sigmae2est, sigmau2est and the random effect (rand_eff).

  est_par <- model_par(
    mixed_model = mixed_model,
    framework = framework,
    fixed = fixed,
    transformation_par = transformation_par
  )

  # Function gen_model calculates the parameters in the generating model.
  # See Molina and Rao (2010) p. 375 (20)
  # The function returns sigmav2est and the constant part mu.
  gen_par <- gen_model(
    model_par = est_par,
    fixed = fixed,
    framework = framework
  )

  # Monte-Carlo approximation --------------------------------------------------
  if (inherits(framework$threshold, "function")) {
    framework$threshold <-
      framework$threshold(
        y =
          as.numeric(framework$smp_data[[paste0(fixed[2])]])
      )
  }

  # The monte-carlo function returns a data frame of desired indicators.
  indicator_prediction <- monte_carlo(
    transformation = transformation,
    L = L,
    framework = framework,
    lambda = optimal_lambda,
    shift = shift_par,
    model_par = est_par,
    gen_model = gen_par,
    fixed = fixed,
    Ydump = Ydump 
  )

  mixed_model$coefficients_weighted <- if (!is.null(framework$weights)) {
    as.numeric(est_par$betas)
  } else {
    NULL
  }
  names(mixed_model$coefficients_weighted) <- if (!is.null(framework$weights)) {
    rownames(est_par$betas)
  } else {
    NULL
  }
  return(list(
    ind = indicator_prediction,
    optimal_lambda = optimal_lambda,
    shift_par = shift_par,
    model_par = est_par,
    gen_model = gen_par,
    model = mixed_model
  ))
} # End point estimation function


# All following functions are only internal ------------------------------------

# Functions to extract and calculate model parameter----------------------------

# Function model_par extracts the needed parameters theta from the nested
# error linear regression model. It returns the beta coefficients (betas),
# sigmae2est, sigmau2est and the random effect (rand_eff).

model_par <- function(framework,
                      mixed_model,
                      fixed,
                      transformation_par) {

  # fixed parametersn
  betas <- nlme::fixed.effects(mixed_model)
  # Estimated error variance
  sigmae2est <- mixed_model$sigma^2
  # VarCorr(fit2) is the estimated random error variance
  sigmau2est <- as.numeric(nlme::VarCorr(mixed_model)[1, 1])
  # Random effect: vector with zeros for all domains, filled with 0
  rand_eff <- rep(0, length(unique(framework$pop_domains_vec)))
  
  
  if (is.null(framework$weights)) {
    # random effect for in-sample domains (dist_obs_dom)
    rand_eff[framework$dist_obs_dom] <- (random.effects(mixed_model)[[1]])
    
    return(list(
      betas = betas,
      sigmae2est = sigmae2est,
      sigmau2est = sigmau2est,
      rand_eff = rand_eff
    ))
} else if (any(framework$weights_type %in% c("nlme", "nlme_lambda"))) {
    rand_eff[framework$dist_obs_dom] <- (random.effects(mixed_model)[[1]])
    weight_sum <- rep(0, framework$N_dom_smp)
    #mean_dep <- rep(0, framework$N_dom_smp)
    #mean_indep <- matrix(0, nrow = framework$N_dom_smp, ncol = length(betas))
    delta2 <- rep(0, framework$N_dom_smp)
    gamma_weight <- rep(0, framework$N_dom_smp)
    for (d in 1:framework$N_dom_smp) {
      domain <- names(table(framework$smp_domains_vec)[d])
      weight_smp <- transformation_par$transformed_data[[
      as.character(framework$weights)]][framework$smp_domains_vec == domain]
      weight_sum[d] <- sum(weight_smp)
      delta2[d] <- sum(weight_smp^2) / (weight_sum[d]^2)
      gamma_weight[d] <- sigmau2est / (sigmau2est + sigmae2est * delta2[d])
      
      # Domain means of of the dependent variable
      #dep_smp <- transformation_par$transformed_data[[
      #  as.character(mixed_model$terms[[2]])]][
      #    framework$smp_domains_vec == domain
      #  ]
      # weighted mean of the dependent variable
      #mean_dep[d] <- sum(weight_smp * dep_smp) / weight_sum[d]
    
      # weighted means of the auxiliary information
      #indep_smp <- if(length(weight_smp) == 1) {
      #  matrix(model.matrix(fixed, framework$smp_data)[framework$smp_domains_vec == domain,]
      #         , ncol = length(betas), nrow = 1)
      #} else {
      #  model.matrix(fixed, framework$smp_data)[framework$smp_domains_vec == domain,]
      #}
      #for (k in 1:length(betas)) {
      #  mean_indep[d, k] <- sum(weight_smp * indep_smp[, k]) / weight_sum[d]
      #}
    }
      # random effect for in-sample domains (dist_obs_dom)
      #rand_eff[framework$dist_obs_dom] <- gamma_weight * (mean_dep -
       #                                                     mean_indep %*% betas)
      
      
      
    
    return(list(
      betas = betas,
      sigmae2est = sigmae2est,
      sigmau2est = sigmau2est,
      rand_eff = rand_eff,
      gammaw = gamma_weight,
      delta2 = delta2
    ))
}
  else {


    # Calculations needed for pseudo EB

    weight_sum <- rep(0, framework$N_dom_smp)
    mean_dep <- rep(0, framework$N_dom_smp)
    mean_indep <- matrix(0, nrow = framework$N_dom_smp, ncol = length(betas))
    delta2 <- rep(0, framework$N_dom_smp)
    gamma_weight <- rep(0, framework$N_dom_smp)
    num <- matrix(0, nrow = length(betas), ncol = 1)
    den <- matrix(0, nrow = length(betas), ncol = length(betas))

    for (d in 1:framework$N_dom_smp) {
      domain <- names(table(framework$smp_domains_vec)[d])

      # Domain means of of the dependent variable
      dep_smp <- transformation_par$transformed_data[[
      as.character(mixed_model$terms[[2]])]][
        framework$smp_domains_vec == domain
      ]
      weight_smp <- transformation_par$transformed_data[[
      as.character(framework$weights)]][framework$smp_domains_vec == domain]
      weight_sum[d] <- sum(weight_smp)

      indep_smp <- if(length(weight_smp) == 1) {
        matrix(model.matrix(fixed, framework$smp_data)[framework$smp_domains_vec == domain,]
               , ncol = length(betas), nrow = 1)
      } else {
        model.matrix(fixed, framework$smp_data)[framework$smp_domains_vec == domain,]
      }

      # weighted mean of the dependent variable
      mean_dep[d] <- sum(weight_smp * dep_smp) / weight_sum[d]

      # weighted means of the auxiliary information
      for (k in 1:length(betas)) {
        mean_indep[d, k] <- sum(weight_smp * indep_smp[, k]) / weight_sum[d]
      }

      delta2[d] <- sum(weight_smp^2) / (weight_sum[d]^2)
      gamma_weight[d] <- sigmau2est / (sigmau2est + sigmae2est * delta2[d])
      weight_smp_diag <- diag(weight_smp)
      dep_var_ast <- dep_smp - gamma_weight[d] * mean_dep[d]
      indep_weight <- t(indep_smp) %*% weight_smp_diag
      indep_var_ast <- indep_smp - matrix(rep(
        gamma_weight[d] *
          mean_indep[d, ],
        framework$n_smp[d]
      ),
      nrow = framework$n_smp[d],
      byrow = TRUE
      )



      num <- num + (indep_weight %*% dep_var_ast)
      den <- den + (indep_weight %*% indep_var_ast)
    }


    betas <- solve(den) %*% num

    # random effect for in-sample domains (dist_obs_dom)
    rand_eff[framework$dist_obs_dom] <- gamma_weight * (mean_dep -
      mean_indep %*% betas)


    return(list(
      betas = betas,
      sigmae2est = sigmae2est,
      sigmau2est = sigmau2est,
      rand_eff = rand_eff,
      gammaw = gamma_weight,
      delta2 = delta2
    ))
  }
} # End model_par



# Function gen_model calculates the parameters in the generating model.
# See Molina and Rao (2010) p. 375 (20)
gen_model <- function(fixed,
                      framework,
                      model_par) {

  if (is.null(framework$weights)) {

    # Parameter for calculating variance of new random effect
    gamma <- model_par$sigmau2est / (model_par$sigmau2est +
      model_par$sigmae2est / framework$n_smp)
    # Variance of new random effect
    sigmav2est <- model_par$sigmau2est * (1 - gamma)
    # Random effect in constant part of y for in-sample households
    rand_eff_pop <- rep(model_par$rand_eff, framework$n_pop)
    # Model matrix for population covariate information
    framework$pop_data[[paste0(fixed[2])]] <- seq_len(nrow(framework$pop_data))
    X_pop <- model.matrix(fixed, framework$pop_data)

    # Constant part of predicted y
    mu_fixed <- X_pop %*% model_par$betas
    mu <- mu_fixed + rand_eff_pop

    return(list(sigmav2est = sigmav2est, mu = mu, mu_fixed = mu_fixed))
  } 
  else if (any(framework$weights_type %in% c("nlme", "nlme_lambda"))) {
    #gamma_old <- model_par$sigmau2est / (model_par$sigmau2est +
                                     #model_par$sigmae2est / framework$n_smp)
    gamma <- model_par$gammaw
    sigmav2est <- model_par$sigmau2est * (1 - gamma)
    rand_eff_pop <- rep(model_par$rand_eff, framework$n_pop)
    framework$pop_data[[paste0(fixed[2])]] <- seq_len(nrow(framework$pop_data))
    X_pop <- model.matrix(fixed, framework$pop_data)
    
    # Constant part of predicted y
    mu_fixed <- X_pop %*% model_par$betas
    mu <- mu_fixed + rand_eff_pop
    return(list(sigmav2est = sigmav2est, mu = mu, mu_fixed = mu_fixed))
  }
  else {
    # Parameter for calculating variance of new random effect
    gamma <- model_par$gammaw
    # Variance of new random effect
    sigmav2est <- model_par$sigmau2est * (1 - gamma)
    # Random effect in constant part of y for in-sample households
    rand_eff_pop <- rep(model_par$rand_eff, framework$n_pop) ####### change
    # Model matrix for population covariate information
    framework$pop_data[[paste0(fixed[2])]] <- seq_len(nrow(framework$pop_data))
    X_pop <- model.matrix(fixed, framework$pop_data)

    # Constant part of predicted y
    mu_fixed <- X_pop %*% model_par$betas
    mu <- mu_fixed + rand_eff_pop


    return(list(sigmav2est = sigmav2est, mu = mu, mu_fixed = mu_fixed))
  }
} # End gen_model


# Monte-Carlo approximation ----------------------------------------------------

# The function approximates the expected value (Molina and Rao (2010)
# p.372 (6)). For description of monte-carlo simulation see Molina and
# Rao (2010) p. 373 (13) and p. 374-375
monte_carlo <- function(transformation,
                        L,
                        framework,
                        lambda,
                        shift,
                        model_par,
                        gen_model,
                        fixed,
                        Ydump) {

  # Preparing matrices for indicators for the Monte-Carlo simulation

  if(!is.null(framework$aggregate_to_vec)){
    N_dom_pop_tmp <- framework$N_dom_pop_agg
    pop_domains_vec_tmp <- framework$aggregate_to_vec
  } else {
    N_dom_pop_tmp <- framework$N_dom_pop
    pop_domains_vec_tmp <- framework$pop_domains_vec
  }


  if (!is.null(Ydump)) {
    Ydumpdf <- data.frame(matrix(ncol = 6, nrow = 0))
    colnames(Ydumpdf) <- c("L","Domain","Simulated_Y","XBetahat","eta","epsilon")
    write.csv(Ydumpdf,Ydump,row.names = FALSE)
  }

  
  ests_mcmc <- array(dim = c(
    N_dom_pop_tmp,
    L,
    length(framework$indicator_names)
  ))

  for (l in seq_len(L)) {

    # Errors in generating model: individual error term and random effect
    # See below for function errors_gen.
    errors <- errors_gen(
      framework = framework,
      model_par = model_par,
      gen_model = gen_model
    )

    # Prediction of population vector y
    # See below for function prediction_y.
    population_vector <- prediction_y(
      transformation = transformation,
      lambda = lambda,
      shift = shift,
      gen_model = gen_model,
      errors_gen = errors,
      framework = framework,
      fixed = fixed
    )

    if(!is.null(framework$pop_weights)){
      pop_weights_vec <- framework$pop_data[[framework$pop_weights]]
    }else{
      pop_weights_vec <- rep(1, nrow(framework$pop_data))
    }
    if (!is.null(Ydump)){
      Ydumpdf <- data.frame(rep(l,nrow(framework$pop_data)), framework$pop_domains_vec,population_vector,gen_model$mu,errors$vu,errors$epsilon)
      #write.csv(Ydumpdf,Ydump,row.names = FALSE,append=TRUE)
      write.table(Ydumpdf,file=Ydump,row.names = FALSE,append=TRUE,col.names=F, sep=",") 
    }

    # Calculation of indicators for each Monte Carlo population
    ests_mcmc[, l, ] <-
      matrix(
        nrow = N_dom_pop_tmp,
        data = unlist(lapply(framework$indicator_list,
          function(f, threshold) {
            matrix(
              nrow = N_dom_pop_tmp,
              data = unlist(mapply(
                y = split(population_vector, pop_domains_vec_tmp),
                pop_weights = split(pop_weights_vec, pop_domains_vec_tmp),
                f,
                threshold = framework$threshold
              )), byrow = TRUE
            )
          },
          threshold = framework$threshold
        ))
      )
  } # End for loop


  # Point estimations of indicators by taking the mean

  point_estimates <- data.frame(
    Domain = unique(pop_domains_vec_tmp),
    apply(ests_mcmc, c(3), rowMeans)
  )
  colnames(point_estimates) <- c("Domain", framework$indicator_names)
  return(point_estimates)
} # End Monte-Carlo


# The function errors_gen returns error terms of the generating model.
# See Molina and Rao (2010) p. 375 (20)

errors_gen <- function(framework, model_par, gen_model) {
  epsilon <- rnorm(framework$N_pop, 0, sqrt(model_par$sigmae2est))

  # empty vector for new random effect in generating model
  vu <- vector(length = framework$N_pop)
  # new random effect for out-of-sample domains
  vu[!framework$obs_dom] <- rep(
    rnorm(
      framework$N_dom_unobs,
      0,
      sqrt(model_par$sigmau2est)
    ),
    framework$n_pop[!framework$dist_obs_dom]
  )
  # new random effect for in-sample-domains
  vu[framework$obs_dom] <- rep(
    rnorm(
      rep(1, framework$N_dom_smp),
      0,
      sqrt(gen_model$sigmav2est)
    ),
    framework$n_pop[framework$dist_obs_dom]
  )
  # individual error term in generating model epsilon

  return(list(epsilon = epsilon, vu = vu))
} # End errors_gen

# The function prediction_y returns a predicted income vector which can be used
# to calculate indicators. Note that a whole income vector is predicted without
# distinction between in- and out-of-sample domains.
prediction_y <- function(transformation,
                         lambda,
                         shift,
                         gen_model,
                         errors_gen,
                         framework,
                         fixed) {

  # predicted population income vector
  y_pred <- gen_model$mu + errors_gen$epsilon + errors_gen$vu

  # back-transformation of predicted population income vector
  y_pred <- back_transformation(
    y = y_pred,
    transformation = transformation,
    lambda = lambda,
    shift = shift,
    framework = framework,
    fixed = fixed
  )
  y_pred[!is.finite(y_pred)] <- 0

  return(y_pred)
} # End prediction_y