lis: InnovaLab Toolkit

Documented in vulnerability_index

#' Vulnerability Index
#'
#' @description It allows the observations to be ordered by different methods (default
#'   percentile rank), and in consideration of domains grouping indicators.
#'
#' @param data A dataframe containing the variables that will be analyzed.
#'
#' @param direct A vector indicating the name of the variables that contribute
#'    directly to the vulnerability index.
#'
#' @param inverse A vector indicating the name of the variables that contribute
#'    inversely to the vulnerability index.
#'
#' @param table A dataframe indicating the variables "Codename", "Direction",
#'    "Domain" and "Geographic_scale" as a minimum. If this argument is specified,
#'    it is no longer necessary to specify the direct and inverse arguments.
#'
#' @param weighted A logical value indicating whether the "Domain" information is
#'    used for the index calculation.
#'
#' @param domains A vector indicating the names of the domains grouping the variables
#'    of interest. If the argument "table" containing the variable "Domain" is used,
#'    only "default" is indicated in this argument.
#'
#' @param level A vector indicating the level of analysis. It currently supports 3
#'    values:"departamental", "provincial", and "distrital".
#'
#' @param ordered A logical value indicating whether the cases analyzed will be
#'    ordered from highest to lowest.
#'
#' @param method A vector indicating the analysis method used for the index.
#'    Currently only "percent_rank" is supported.
#'
#' @param complete If true, the results table will show all the variables that
#'    the dataframe has in addition to the computations performed.
#'
#' @param na.rm A logical value indicating whether cases where missing values are
#'    eliminated.
#'
#' @return A \code{\link[tibble]{tibble}} containing the variables analyzed by the function.
#' @seealso \code{\link[dplyr]{percent_rank}}, \code{\link[psych]{principal}}
#'
#' @import dplyr
#' @importFrom purrr map map2 reduce
#' @importFrom magrittr use_series
#' @importFrom stringr str_sub str_to_lower str_replace
#' @importFrom rlang .data
#' @importFrom tidyr nest unnest drop_na separate
#' @importFrom psych principal
#' @importFrom methods new
#' @importFrom stats cor
#' @export
#'
#' @examples
#' \dontrun{
#' Sigma_Dom1 <- matrix(rep(c(1, runif(4, 0.65, 0.9)), 4),
#'                      4, 4, byrow = TRUE)
#'
#' Sigma_Dom2 <- matrix(rep(c(1, runif(4, 0.65, 0.9)), 4),
#'                      4, 4, byrow = TRUE)
#'
#' Dom1 <- as.data.frame(MASS::mvrnorm(100, rep(0, 4), Sigma_Dom1))
#' Dom2 <- as.data.frame(MASS::mvrnorm(100, rep(0, 4), Sigma_Dom2))
#' colnames(Dom2) <- paste0("V", 5:8)
#'
#' df_example <- data.frame(distr = paste0("distr", 1:100))
#' df_example <- cbind(df_example, Dom1, Dom2)
#'
#' table_var <- data.frame(
#'   Codename = paste0("V", 1:8),
#'   Direction = "Direct",
#'   Domain = c(rep("Dom1", 4), rep("Dom2", 4)),
#'   Geographic_scale = "Distrital"
#' )
#'
#' vulnerability_index(df_example, table = table_var,
#'                     level = "distrital", na.rm = TRUE,
#'                     method = "pca")
#'
#'
#' vulnerability_index(df_example, table = table_var,
#'                     level = "distrital", na.rm = TRUE,
#'                     method = "percent_rank")
#' }

vulnerability_index <- function(data, direct = NULL, inverse = NULL, table = NULL,
                                weighted = TRUE, domains = 'default', level = NULL,
                                ordered = TRUE, method = "percent_rank", complete = TRUE,
                                na.rm = FALSE) {
  # Checking
  if (is.null(table) & (is.null(direct) | is.null(inverse))) {
    stop(paste("It is necessary that direct and inverse contain information",
               "or that the table argument is specified."))
  }

  if (!is.null(table) & (!is.null(direct) | !is.null(inverse))) {
    stop(paste("It is only necessary to specify direct/inverse or table.",
               "Not all 3 arguments at the same time."))
  }

  if(is.null(level)) {
    stop(paste("Now it is necessary to indicate the level of analysis to",
               "be performed. Three values are allowed: distrital, provincial",
               "departamental"))
  } else if (!level %in% c("distrital", "provincial", "departamental")) {
    stop(paste("Only 3 values are allowed: distrital, privincial, departamental."))
  }

  ## Options

  indexoptions <- list()
  indexoptions$direct <- direct
  indexoptions$inverse <- inverse
  indexoptions$table <- table
  indexoptions$weighted <- weighted
  indexoptions$domains <- domains
  indexoptions$level <- level
  indexoptions$ordered <- ordered
  indexoptions$method <- method
  indexoptions$complete <- complete
  indexoptions$na.rm <- na.rm

  # Get data about variable's direction
  # Temp only table argument is support

  if(level == "distrital") {
    table <- table %>%
      filter(.data$Geographic_scale == "Distrital")
  } else if (level == "provincial") {
    table <- table %>%
      filter(.data$Geographic_scale == "Distrital")
  }

  if(!is.null(table)) {
    direct <- table %>%
      filter(.data$Direction == "Direct") %>%
      use_series(Codename)

    inverse <- table %>%
      filter(.data$Direction == "Inverse") %>%
      use_series(Codename)
  }

  if (weighted & domains == 'default') {
    n_domains <- length(table(table$Domain))
    names_domains <- str_sub(str_to_lower(sort(unique(table$Domain))), 1, 6)
    # names_domains <- paste0("rank_", names_domains)
    # if (method == "percent_rank") {
    #   names_domains <- paste0("rank_", names_domains)
    # } else if (method == "pca") {
    #   names_domains <- paste0("pca_", names_domains)
    # }
    vars_domains <- table %>%
      group_nest(.data$Domain) %>%
      use_series(data) %>%
      map(~ .x$Codename)
    vars_vulnerability <- table$Codename
  }

  data <- data %>%
    select(starts_with("dep"),
           starts_with("prov"),
           starts_with("distr"),
           {{ vars_vulnerability }})

  # Function body

  if (method == "percent_rank") {

    names_domains <- paste0("rank_", names_domains)

    if (weighted) {
      df_rank <- data %>%
        mutate(
          across({{ direct }}, percent_rank, .names = "rank_{.col}"),
          across({{ inverse }}, ~ percent_rank(desc(.)), .names = "rank_{.col}")
        ) %>%
        rowwise()

      for (i in 1:n_domains) {
        name_domain_C <- names_domains[i]
        var_domain <- vars_domains[[i]]
        var_domain <- paste0("rank_", var_domain)

        df_rank <- df_rank %>%
          mutate(
            {{ name_domain_C }} := sum(c_across({{ var_domain }}), na.rm = na.rm)
          )
      }

      rank_vars_vulnerability <- paste0("rank_", vars_vulnerability)

      df_rank <- df_rank %>%
        ungroup() %>%
        mutate(
          across( {{names_domains}}, percent_rank)
        ) %>%
        rowwise() %>%
        mutate(
          sumr = sum(c_across({{ names_domains }}), na.rm = na.rm)
        ) %>%
        ungroup() %>%
        mutate(
          Rank_T = percent_rank(.data$sumr) # Ranking percentil global
        ) %>%
        select(-c(
          #{{ vars_vulnerability }},
          {{ rank_vars_vulnerability }},
          .data$sumr
        )) %>%
        relocate(
          starts_with("dep"),
          starts_with("prov"),
          starts_with("distr"),
          {{ names_domains }},
          .data$Rank_T
        )

    } else {
      df_rank <- data %>%
        mutate(
          across({{ direct }}, percent_rank),
          across({{ inverse }}, percent_rank)
        ) %>%
        rowwise() %>%
        mutate(
          sumr = sum(c_across(c({{ direct }}, {{ inverse }})), na.rm = na.rm)
        ) %>%
        ungroup() %>% # desagrupar
        mutate(
          Rank_T = percent_rank(.data$sumr) # Ranking percentil global
        ) %>%
        select(-c(.data$sumr))
      # select(dep:distr, {{ direct }}, {{ inverse }}, Rank_T)
    }

  } else if (method == "pca") {

    names_domains_data <- paste0("data_", names_domains)
    names_domains_pca <- paste0("pca_", names_domains)
    names_domains_aug <- paste0("fitted_", names_domains)
    # names_domains_loadings <- paste0("tidy_", names_domains)
    # names_domains_explained <- paste0("explained_", names_domains)
    # names_domains_fit <- paste0("fit_", names_domains)

    df_rank <- data %>%
      nest(data = everything())

    df_rank_list <- list()
    df_rank_list_g <- list()

    for (i in 1:n_domains) {
      name_domain_data <- names_domains_data[i]
      name_domain_C_pca <- names_domains_pca[i]
      name_domain_aug <- names_domains_aug[i]
      # name_domains_loadings <- names_domains_loadings[i]
      # name_domains_explained <- names_domains_explained[i]
      # name_domains_fit <- names_domains_fit[i]

      var_domain <- vars_domains[[i]]

      df_rank_list_g[[i]] <- df_rank %>%
        mutate(
          {{ name_domain_data }} := map(data,
                                        ~ .x %>%
                                          select(starts_with("dep"),
                                                 starts_with("prov"),
                                                 starts_with("distr"),
                                                 {{ var_domain }}) %>%
                                          drop_na()),
          {{ name_domain_C_pca }} := map(eval(parse(text = name_domain_data)),
                                         ~ .x %>%
                                           select({{ var_domain }}) %>%
                                           mutate(
                                             across(
                                               .cols = c({{ var_domain }}),
                                               .fns   = ~ scale(.)[,1]
                                             )
                                           ) %>%
                                           principal(r = .,
                                                     nfactors = 1)),
          {{ name_domain_aug }} := map2(eval(parse(text = name_domain_C_pca)),
                                        eval(parse(text = name_domain_data)),
                                        ~ .y %>%
                                          bind_cols(
                                            .x$scores %>%
                                              as_tibble()
                                          ) %>%
                                          rename({{ name_domain_C_pca }} := .data$PC1)),
          loadings = map(eval(parse(text = name_domain_C_pca)),
                         ~ .x$loadings %>%
                           unclass() %>%
                           as_tibble(rownames = "Indicators") %>%
                           mutate(
                             h2 = .x$communality,
                             u2 = .x$uniquenesses,
                             com = .x$complexity
                          )),
          explained = map(eval(parse(text = name_domain_C_pca)),
                          ~ .x$Vaccounted %>%
                            as_tibble(rownames = "Variance")),
          fit = map2(eval(parse(text = name_domain_C_pca)),
                     eval(parse(text = name_domain_data)),
                     ~ .y %>%
                       select({{ var_domain }}) %>%
                       fit_measures(.x, .))
        )

      df_rank_list[[i]] <- df_rank_list_g[[i]] %>%
        select({{ name_domain_aug }}) %>%
        unnest(cols = {{ name_domain_aug }})
    }

    df_rank <- df_rank_list %>%
      reduce(full_join) %>%
      drop_na()

    df_rank_g <- df_rank %>%
      nest(data = everything()) %>%
      mutate(
        pca = map(data,
                  ~ .x %>%
                    select({{ names_domains_pca }}) %>%
                    principal(r = .,
                              nfactors = 1)),
        fitted = map2(.data$pca, .data$data,
                      ~ .y %>%
                        bind_cols(
                          .x$scores %>%
                            as_tibble()
                        ) %>%
                        rename(fit_g := .data$PC1)),
        loadings = map(.data$pca,
                       ~ .x$loadings %>%
                         unclass() %>%
                         as_tibble(rownames = "Indicators") %>%
                         mutate(
                           h2 = .x$communality,
                           u2 = .x$uniquenesses,
                           com = .x$complexity
                         )),
        explained = map(.data$pca,
                        ~ .x$Vaccounted %>%
                          as_tibble(rownames = "Variance")),
        fit = map2(.data$pca, .data$data,
                   ~ .y %>%
                     select({{ names_domains_pca }}) %>%
                     fit_measures(.x, .))
      )

    df_rank <- df_rank_g %>%
      select(.data$fitted) %>%
      unnest(cols = .data$fitted)

    # Detect direction of domains
    direction_loadings <- df_rank_g$loadings[[1]] %>%
      mutate(
        Direction = ifelse(.data$PC1 >= 0,
                           "Direct",
                           "Inverse")
      )

    direct_domains_loading <- direction_loadings %>%
      filter(.data$Direction == "Direct") %>%
      pull(.data$Indicators)

    names_direct_domains_loading <- str_replace(direct_domains_loading,
                                                "pca_", "rank_")

    inverse_domains_loading <- direction_loadings %>%
      filter(.data$Direction == "Inverse") %>%
      pull(.data$Indicators)

    names_inverse_domains_loading <- str_replace(inverse_domains_loading,
                                                "pca_", "rank_")

    #######################################################

    names_domains_rank <- paste0("rank_", names_domains)

    names_domains_pca_rank <- data.frame(string = c(names_domains_pca,
                                                    names_domains_rank)) %>%
      separate(.data$string, c("type", "variable"),
               remove = FALSE) %>%
      arrange(.data$variable) %>%
      pull(string)

    df_rank <- df_rank %>%
      mutate(
        Rank_T = percent_rank(.data$fit_g),
        across({{ direct_domains_loading }}, percent_rank,
               .names = "{names_direct_domains_loading}"),
        across({{ inverse_domains_loading }}, ~ percent_rank(desc(.)),
               .names = "{names_inverse_domains_loading}"),
      ) %>%
      relocate(starts_with("dep"),
               starts_with("prov"),
               starts_with("distr"),
               {{ names_domains_pca_rank }},
               .data$fit_g, .data$Rank_T)
  }

  if (ordered) {
    df_rank <- df_rank %>%
      arrange(desc(.data$Rank_T))
  }

  if (complete == FALSE) {
    df_rank <- df_rank %>%
      select(-c({{ vars_vulnerability }}))
  }

  if (method == "percent_rank") {

    indexcalc <- new("indexcalc",
                     Options      = indexoptions,          # list
                     Data         = df_rank             # S4 class
    )

  } else if (method == "pca") {

    Specificfit <- list()

    for (i in 1:n_domains) {
      names_domain <- names_domains[[i]]
      Specificfit[[names_domain]] <- new("indexFit",
                                         Fits = df_rank_list_g[[i]]$fit[[1]],
                                         Explained = df_rank_list_g[[i]]$explained[[1]],
                                         Loadings = df_rank_list_g[[i]]$loadings[[1]]
      )
    }

    indexfits <- new("indexFits",
                     Specific = Specificfit,
                     Fits = df_rank_g$fit[[1]],
                     Explained = df_rank_g$explained[[1]],
                     Loadings = df_rank_g$loadings[[1]]
    )

    indexcalc <- new("indexcalc",
                     Options      = indexoptions,          # list
                     Data         = df_rank,             # S4 class
                     Fit          = indexfits              # S4 class
    )
  }

  return(indexcalc)

}


fit_measures <- function(pca, data) {
  bartlett <- data %>%
    cor() %>%
    psych::cortest.bartlett(., nrow(data))

  bartlett <- tibble(
    Chisq = bartlett$chisq,
    df = bartlett$df,
    p_value = bartlett$p.value
  )

  KMO <- data %>%
    cor() %>%
    psych::KMO()

  KMO <- tibble(
    Indicators = "Overall",
    KMO = KMO$MSA
  ) %>%
    bind_rows(
      KMO$MSAi %>%
        as_tibble(rownames = "Indicators") %>%
        rename(KMO = .data$value)
    )

  Chi <- tibble(
    Chisq = pca$chi,
    p_value = pca$EPVAL
  )

  Fit <- list(
    bartlett = bartlett,
    KMO = KMO,
    Chi = Chi,
    RMSR = pca$rms,
    Fit_off = pca$fit.off
  )

  return(Fit)
}