R/model_quantreg_rsubbins.R

Defines functions model_quantreg_rsubbins

#' @export

model_quantreg_rsubbins <- function(formula, data_pnad, tau = .5,
                                    groups = NULL, confidence_intervals = F,
                                    binSampleSize = 1000,
                                    CI_lowerBound = .025, CI_upperBound = .975,
                                    nsim_CI = 2000){

        dep   = all.vars(formula[[2]])
        indep = all.vars(formula[[3]])

        if(any(indep %in% groups) | any(groups %in% indep)){
                stop("Group variables cannot be used as independent variables in the model")
        }

        if(is.null(groups)){
                data_pnad <- data_pnad %>%
                        mutate(ID = 1)

                IDs <- unique(data_pnad$ID)

                data_pnad <- data_pnad %>%
                        group_by_at(c("ID", "min_faixa", "max_faixa", indep)) %>%
                        summarise(n         = sum(n)) %>%
                        ungroup() %>%
                        arrange(ID, min_faixa)

        }else{
                data_pnad <-
                        data_pnad %>%
                        unite(col = ID, groups)

                IDs <- unique(data_pnad$ID)

                data_pnad <- data_pnad %>%
                        group_by_at(c("ID", "min_faixa", "max_faixa", indep)) %>%
                        summarise(n = sum(n)) %>%
                        ungroup() %>%
                        arrange(ID, min_faixa)
        }

        data_pnad <- tableInequality:::prepare_to_regression(data_pnad, indep)
        gc()

        data_split <- split(data_pnad, f = data_pnad$ID)

        #which(names(data_split) == 1431102)

        #data_i = data_split[[1]]

        regquantile <- function(data_i, formula, tau,
                                groups, confidence_intervals,
                                binSampleSize, dep, indep,
                                CI_lowerBound, CI_upperBound,
                                nsim_CI){

                ID_i <- data_i$ID %>% unique()

                data_grouped <- data_i %>%
                        unite(col = varIndepIDs, indep) %>%
                        group_by(varIndepIDs, min_faixa, max_faixa) %>%
                        summarise(n         = sum(n)) %>%
                        ungroup() %>%
                        arrange(varIndepIDs, min_faixa)

                data_p <- data_grouped %>%
                        group_by(varIndepIDs) %>%
                        summarise(n = sum(n)) %>%
                        ungroup() %>%
                        mutate(p = n/sum(n))

                X_groups = separate(data_p[,"varIndepIDs"],
                                    col = varIndepIDs, into = indep)
                formula_tmp = formula
                formula_tmp[[2]] = NULL
                X_groups = model.matrix(formula_tmp, data = X_groups)

                test_indeps <- tableInequality:::test_for_regression(data_i, indep)

                if(any(test_indeps)){
                        if(confidence_intervals == F){
                                coefs = matrix(rep(as.numeric(NA), ncol(X_groups)), nrow = 1)
                                colnames(coefs) = colnames(X_groups)
                                coefs <- as_tibble(coefs)
                                coefs <- bind_cols(tibble(ID = ID_i), coefs)
                                return(coefs)
                        }else{
                                coefs = matrix(rep(as.numeric(NA), ncol(X_groups)*3), nrow = 1)

                                colNames_tmp1 <- rep(colnames(X_groups), each = 3)
                                colNames_tmp2 <- rep(paste0("_P", c(CI_lowerBound, .50, CI_upperBound)), length(colnames(X_groups)))

                                colNames <- paste0(colNames_tmp1, colNames_tmp2)

                                colnames(coefs) = colNames
                                coefs <- as_tibble(coefs)
                                coefs <- bind_cols(tibble(ID = ID_i), coefs)
                                return(coefs)
                        }
                }

                data_split_indep <- split(data_grouped, data_grouped$varIndepIDs)

                rsub_fits <- lapply(data_split_indep, function(data_x){
                        try(rsubbins(bEdges  = c(1, data_x$max_faixa),
                                     bCounts = c(0,data_x$n)), silent = T)
                })

                quantile_functions <- lapply(rsub_fits, function(x) {
                        if("try-error" %in% class(x)){
                                function(p) rep(NA, length(p))
                        }else{
                                Vectorize(tableInequality:::inverse(x$rsubCDF,
                                                                           lower = 1,
                                                                           upper = x$E,
                                                                           extendInt = "yes"))
                        }
                })

                generate_simData_group = function(i, data_p, rsub_fits, quantile_functions, indep, nsims_i = NULL){

                        #print(i)
                        if(is.null(nsims_i)){
                                nsims_i <- data_p$n[i]
                        }

                        if("try-error" %in% class(rsub_fits[[i]])|nsims_i == 0){
                                return(NULL) #################################################TESTAR
                        }

                        X_values = separate(data_p[i,"varIndepIDs"],
                                            col = varIndepIDs, into = indep)

                        data_sim_i <- bind_rows(replicate(nsims_i, X_values , simplify = FALSE)) %>%
                                mutate(y   = quantile_functions[[i]](runif(nsims_i)),
                                       wgt = rsub_fits[[i]]$rsubPDF(y) * data_p[i,]$p)

                        data_sim_i
                }

                get_coef <- function(data_p,
                                     formula,
                                     rsub_fits,
                                     tau,
                                     quantile_functions,
                                     indep,
                                     nsims_i){

                        data_sim = map_dfr(.x = 1:length(rsub_fits),
                                           .f  = function(x) generate_simData_group(i = x,
                                                                                    data_p = data_p,
                                                                                    rsub_fits = rsub_fits,
                                                                                    quantile_functions = quantile_functions,
                                                                                    indep = indep, nsims_i = nsims_i))

                        y   = model.matrix(formula[1:2], data_sim)[, 2]
                        wgt = data_sim$wgt
                        X   = model.matrix(formula, data_sim)

                        obj = function(b, tau){
                                Indicador1 = as.numeric(y >= as.numeric(X%*%b))
                                Indicador2 = as.numeric(y < as.numeric(X%*%b))

                                sum(wgt*Indicador1*tau*abs(y - as.numeric(X%*%b))) + sum(wgt*Indicador2*(1 - tau)*abs(y - as.numeric(X%*%b)))
                        }

                        estimated_values <- try( nlm(f       = obj,
                                                     p       = rep(1, ncol(X)),
                                                     tau     = tau,
                                                     steptol = 1e-10,
                                                     gradtol = 1e-10,
                                                     iterlim = 1000),
                                                 silent = T)

                        if("try-error" %in% class(estimated_values)){
                                return(rep(NA, ncol(X)))
                        }else{
                                if(estimated_values$code == 5){
                                        return(rep(NA, ncol(X)))
                                }else{
                                        return(estimated_values$estimate)
                                }
                        }
                }


                if(confidence_intervals == F){

                        coefs <- get_coef(tau = tau,
                                          data_p    = data_p,
                                          formula   = formula,
                                          rsub_fits = rsub_fits,
                                          quantile_functions = quantile_functions,
                                          indep   = indep,
                                          nsims_i = binSampleSize)

                        coefs <- matrix(coefs, nrow = 1)

                        colnames(coefs) = colnames(X_groups)
                        coefs <- as_tibble(coefs)
                        coefs <- bind_cols(tibble(ID = ID_i), coefs)
                        return(coefs)

                }else{

                        coeficients_distribution <- future_map(.x = 1:nsim_CI,
                                                               .f = function(x){
                                                                       get_coef(tau       = tau,
                                                                                data_p    = data_p,
                                                                                formula   = formula,
                                                                                rsub_fits = rsub_fits,
                                                                                quantile_functions = quantile_functions,
                                                                                indep     = indep,
                                                                                nsims_i   = NULL)},
                                                               .progress = T)

                        coeficients_distribution <- do.call(rbind, coeficients_distribution)

                        coef_ConfInterval <- apply(coeficients_distribution, 2, quantile, probs = c(CI_lowerBound, .5, CI_upperBound))
                        dim(coef_ConfInterval) = NULL
                        coef_ConfInterval = matrix(coef_ConfInterval, nrow = 1)


                        colNames_tmp1 <- rep(colnames(X_groups), each = 3)
                        colNames_tmp2 <- rep(paste0("_P", c(CI_lowerBound, .50, CI_upperBound)), length(colnames(X_groups)))
                        colNames      <- paste0(colNames_tmp1, colNames_tmp2)

                        colnames(coef_ConfInterval) = colNames
                        coef_ConfInterval <- as_tibble(coef_ConfInterval)
                        coef_ConfInterval <- bind_cols(tibble(ID = ID_i), coef_ConfInterval)
                        return(coef_ConfInterval)
                }

        }

        if(!any(c("multiprocess", "multicore", "multisession", "cluster") %in% class(plan()))){
                plan(multisession)
        }

        #test = NULL
        #for(i in 1:30){
        #        print(i)
        #        test[[i]] <- regquantile(data_i = data_split[[i]],
        #                                 formula = formula,
        #                                 tau = tau,
        #                                 groups = groups,
        #                                 confidence_intervals = confidence_intervals,
        #                                 binSampleSize = binSampleSize,
        #                                 CI_lowerBound = CI_lowerBound,
        #                                 CI_upperBound = CI_upperBound,
        #                                 nsim_CI = nsim_CI)
        #}

        #data_split2 = data_split[1:10]

        regquantile_result <- future_map(.x = data_split,
                                         .f = regquantile,
                                         formula = formula,
                                         tau = tau,
                                         groups = groups,
                                         confidence_intervals = confidence_intervals,
                                         binSampleSize = binSampleSize,
                                         CI_lowerBound = CI_lowerBound,
                                         CI_upperBound = CI_upperBound,
                                         dep = dep, indep = indep,
                                         nsim_CI = nsim_CI,
                                         .progress = T,
                                         .options = future_options(globals = c("formula",
                                                                               "tau",
                                                                               "groups",
                                                                               "confidence_intervals",
                                                                               "binSampleSize",
                                                                               "CI_lowerBound",
                                                                               "CI_upperBound",
                                                                               "dep",
                                                                               "indep",
                                                                               "nsim_CI"),
                                                                   packages = c("tidyr",
                                                                                "dplyr",
                                                                                "binsmooth",
                                                                                "quantreg",
                                                                                "data.table")
                                                                   ))

        regquantile_result <- do.call(bind_rows, regquantile_result)

        regquantile_result <- left_join(tibble(ID = IDs),
                                        regquantile_result)

        if(is.null(groups)){
                regquantile_result <- regquantile_result %>%
                        dplyr::select(-ID)
        }else{
                regquantile_result <- regquantile_result %>%
                        separate(col = ID, into = groups, sep = "_")
        }

        regquantile_result

}
antrologos/tableInequality documentation built on May 9, 2023, 1:04 p.m.