R/hello.R

Defines functions stein_beta cluster_map Spdeplisa listw Repglmre2 single_glmre2 Replmre2 single_lmre2 Replm2 single_lm2 .onAttach

Documented in cluster_map listw Repglmre2 Replm2 Replmre2 single_glmre2 single_lm2 single_lmre2 Spdeplisa stein_beta

.onAttach <- function(libname, pkgname) {
  required_packages <- c("dplyr", "ggplot2", "rlang", "sf", "spdep", "viridis", "nlme", "MuMIn", "tidyr", "spData")

  missing_packages <- required_packages[!sapply(required_packages, requireNamespace, quietly = TRUE)]

  if (length(missing_packages) > 0) {
    packageStartupMessage("The following packages are missing and should be installed: ", paste(missing_packages, collapse = ", "))
  }

  packageStartupMessage("DHSr package loaded with all dependencies.")
}

#' Linear Regression Analysis for Specified Location
#'
#' This function runs a linear regression model for a specified location within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param response_distribution The distribution of the response variable ("normal" for normal distribution, "other" for other distributions).
#' @param family The family to be used for GLM if response_distribution is "other" (e.g., `binomial` for logistic regression).
#' @param location_index The specific location index or number for which the model should be run.
#' @return A dataframe containing the results for the specified location.
#' @examples
#' set.seed(123)
#' if (requireNamespace("dplyr", quietly = TRUE)) {
#'   library(dplyr)
#' # Create dummy data
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE),  # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)     # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Define a simple regression formula
#' formula <- years_education ~ gender_female + household_wealth + household_wealth:gender_female
#'
#' # Run the regression for a specific location (e.g., district 1)
#' result_single_lm <- single_lm2(dummy_data, formula, "district_code",
#'  response_distribution = "normal", location_index = 1)
#'
#' # View the result
#' print(result_single_lm)
#' }
#' @importFrom stats as.formula glm lm na.omit
#' @importFrom sf st_drop_geometry
#' @importFrom dplyr filter sym select all_of
#' @importFrom dplyr %>%
#' @importFrom stats sd setNames
#' @export
single_lm2 <- function(data, formula, location_var, response_distribution = "normal", family = NULL, location_index) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }

  subset_data <- dplyr::filter(data, !!rlang::sym(location_var) == location_index)

  if ("geometry" %in% colnames(subset_data)) {
    subset_data <- sf::st_drop_geometry(subset_data)
  }

  relevant_vars <- all.vars(formula)
  subset_data <- dplyr::select(subset_data, dplyr::all_of(relevant_vars))

  subset_data <- stats::na.omit(subset_data)

  var_lengths <- sapply(subset_data, length)
  if (any(var_lengths != var_lengths[1])) {
    stop("Variable lengths differ in the subset data.")
  }

  model <- tryCatch({
    if (response_distribution == "normal") {
      stats::lm(formula, data = subset_data)
    } else if (response_distribution == "other" && !is.null(family)) {
      stats::glm(formula, family = family, data = subset_data)
    } else {
      stop("Unsupported response distribution or missing family parameter for GLM.")
    }
  }, error = function(e) {
    stop(paste("Error fitting model:", e$message))
  })

  coef_summary <- summary(model)$coefficients
  coefs <- coef_summary[, "Estimate"]
  se <- coef_summary[, "Std. Error"]

  if (response_distribution == "normal") {
    p_values <- coef_summary[, "Pr(>|t|)"]
    r_squared <- summary(model)$r.squared
    r_squared_df <- data.frame(
      term = "R-squared",
      estimate = r_squared,
      std_error = NA,
      p_value = NA,
      stringsAsFactors = FALSE
    )
  } else {
    p_values <- coef_summary[, "Pr(>|z|)"]
    r_squared_df <- NULL
  }

  results <- data.frame(
    term = rownames(coef_summary),
    estimate = coefs,
    std_error = se,
    p_value = p_values,
    stringsAsFactors = FALSE
  )

  if (!is.null(r_squared_df)) {
    results <- rbind(results, r_squared_df)
  }

  results$location <- location_index

  return(results)
}

utils::globalVariables(c("term", "estimate", "std_error", "p_value", "cluster_id", "cluster_area", "REGCODE", "lambda_d", "mu_beta_m", "n_k", "sigma_hat_sq", "sum_of_squares"))

#' Run Regression Analysis for All Locations
#'
#' This function runs a regression model for all unique locations within a dataset and combines the results.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param response_distribution The distribution of the response variable ("normal" for normal distribution, "other" for other distributions).
#' @param family The family to be used for GLM if response_distribution is "other" (e.g., `binomial` for logistic regression).
#' @return A dataframe containing the combined results for all locations.
#' @examples
#' set.seed(123)
#' library(dplyr)
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE),  # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE) # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Define a simple regression formula
#' formula <- years_education ~ gender_female + household_wealth + household_wealth:gender_female
#'
#' # Run the regression across all locations (districts)
#' results1 <- Replm2(dummy_data, formula, "district_code", "normal")
#' print(results1)
#' @importFrom tidyr pivot_wider
#' @export
Replm2 <- function(data, formula, location_var, response_distribution = "normal", family = NULL) {
  location_indices <- unique(data[[location_var]])

  all_results <- list()

  for (location_index in location_indices) {
    result <- tryCatch({
      single_lm2(data, formula, location_var, response_distribution, family, location_index)
    }, error = function(e) {
      message(paste("Error processing location index", location_index, ":", e$message))
      return(NULL)
    })

    if (!is.null(result)) {
      all_results[[as.character(location_index)]] <- result
    }
  }

  combined_results <- do.call(rbind, all_results)

  reshaped_results <- tidyr::pivot_wider(
    combined_results,
    names_from = term,
    values_from = c(estimate, std_error, p_value),
    names_sep = "_"
  )

  return(reshaped_results)
}

#' Mixed-Effects Regression Analysis for Specified Location
#'
#' This function runs a mixed-effects regression model for a specified location within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param random_effect_var The variable to be used as a random effect (e.g., `hhid`).
#' @param location_index The specific location index or number for which the model should be run.
#' @return the results for the specified location to test
#' @examples
#' set.seed(123)
#' library(dplyr)
#' # Create dummy data
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE),  # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)     # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Create HHid (Household ID), grouping every 3-4 records, and convert to character
#' dummy_data$HHid <- as.character(rep(1:20, each = 5, length.out = nrow(dummy_data)))
#'
#' # Define a simple regression formula
#' formula <- years_education ~ gender_female + household_wealth:gender_female
#'
#' # Set the location and random effect variables
#' location_var <- "district_code"
#' random_effect_var <- "HHid"
#'
#' # Run the mixed-effects regression for a specific location (e.g., district 1)
#' result_single_lmre <- single_lmre2(dummy_data, formula, location_var,
#' random_effect_var, location_index = 1)
#'
#' # View the result
#' print(result_single_lmre)
#' @importFrom nlme lme
#' @importFrom MuMIn r.squaredGLMM
#' @export
single_lmre2 <- function(data, formula, location_var, random_effect_var, location_index) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }

  subset_data <- dplyr::filter(data, !!rlang::sym(location_var) == location_index)

  if ("geometry" %in% colnames(subset_data)) {
    subset_data <- sf::st_drop_geometry(subset_data)
  }

  relevant_vars <- all.vars(formula)
  relevant_vars <- c(relevant_vars, random_effect_var)
  subset_data <- dplyr::select(subset_data, dplyr::all_of(relevant_vars))

  subset_data <- stats::na.omit(subset_data)

  subset_data[[random_effect_var]] <- as.factor(subset_data[[random_effect_var]])

  var_lengths <- sapply(subset_data, length)
  if (any(var_lengths != var_lengths[1])) {
    stop("Variable lengths differ in the subset data.")
  }

  model <- tryCatch({
    nlme::lme(as.formula(formula), random = as.formula(paste0("~ 1 | ", random_effect_var)), data = subset_data)
  }, error = function(e) {
    stop(paste("Error fitting model:", e$message))
  })

  coef_summary <- summary(model)$tTable
  coefs <- coef_summary[, "Value"]
  se <- coef_summary[, "Std.Error"]
  p_values <- coef_summary[, "p-value"]

  r_squared <- MuMIn::r.squaredGLMM(model)

  results <- data.frame(
    term = rownames(coef_summary),
    estimate = coefs,
    std_error = se,
    p_value = p_values,
    stringsAsFactors = FALSE
  )

  r_squared_df <- data.frame(
    term = c("Marginal R-squared", "Conditional R-squared"),
    estimate = c(r_squared[1], r_squared[2]),
    std_error = c(NA, NA),
    p_value = c(NA, NA),
    stringsAsFactors = FALSE
  )
  results <- rbind(results, r_squared_df)

  results$location <- location_index

  return(results)
}

utils::globalVariables(c("term", "estimate", "std_error", "p_value", "cluster_id", "cluster_area", "REGCODE", "lambda_d", "mu_beta_m", "n_k", "sigma_hat_sq", "sum_of_squares"))

#' Mixed-Effects Regression Analysis for All Locations
#'
#' This function runs a mixed-effects regression model for all locations within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param random_effect_var The variable to be used as a random effect (e.g., `hhid`).
#' @return A dataframe containing the results
#' @examples

#' set.seed(123)
#' library(dplyr)
#' # Create dummy data
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE),  # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)     # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Create HHid (Household ID), grouping every 3-4 records, and convert to character
#' dummy_data$HHid <- as.character(rep(1:20, each = 5, length.out = nrow(dummy_data)))
#'
#' # Define a simple regression formula
#' formula <- years_education ~ gender_female + household_wealth:gender_female
#' location_var <- "district_code"
#' random_effect_var <- "HHid"
#'
#' # Run mixed-effects regression for all districts
#' results <- DHSr::Replmre2(dummy_data, formula, location_var, random_effect_var)
#' print(head(results))
#' @importFrom tidyr pivot_wider
#' @export
Replmre2 <- function(data, formula, location_var, random_effect_var) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }

  location_indices <- unique(data[[location_var]])
  all_results <- data.frame()

  for (location_index in location_indices) {
    result <- tryCatch({
      DHSr::single_lmre2(data, formula, location_var, random_effect_var, location_index)
    }, error = function(e) {
      message(paste("Error processing location index:", location_index, "-", e$message))
      return(NULL)
    })

    if (!is.null(result)) {
      all_results <- rbind(all_results, result)
    }
  }

  reshaped_results <- tidyr::pivot_wider(
    all_results,
    names_from = term,
    values_from = c(estimate, std_error, p_value),
    names_sep = "_"
  )


  return(reshaped_results)
}

#' Mixed-Effects Logistic Regression Analysis for a Specified Location
#'
#' This function runs a mixed-effects logistic regression model for a specified location within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param random_effect_var The variable to be used as a random effect (e.g., `hhid`).
#' @param location_index The specific location index or number for which the model should be run.
#' @param family The family to be used for GLM (e.g., `binomial` for logistic regression).
#' @return The results for sigle location to test
#' @examples
#' set.seed(123)
#'   library(dplyr)
#' # Create dummy data
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE),  # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)     # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Create HHid (Household ID), grouping every 3-4 records, and convert to character
#' dummy_data$HHid <- as.character(rep(1:20, each = 5, length.out = nrow(dummy_data)))
#'
#' # Create a binary outcome variable for years of education
#' dummy_data$education_binary <- ifelse(dummy_data$years_education > 11, 1, 0)
#'
#' # Define a logistic regression formula
#' formula <- education_binary ~ gender_female + household_wealth:gender_female
#'
#' # Set the location and random effect variables
#' location_var <- "district_code"
#' random_effect_var <- "HHid"
#'
#' # Run the mixed-effects logistic regression for a specific location (e.g., district 1)
#' result_single_glmre <- single_glmre2(dummy_data, formula, location_var, random_effect_var,
#'  location_index = 1, family = binomial())
#'
#' # View the result
#' print(result_single_glmre)
#' @importFrom nlme lme
#' @export
single_glmre2 <- function(data, formula, location_var, random_effect_var, location_index, family = NULL) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }

  subset_data <- dplyr::filter(data, !!rlang::sym(location_var) == location_index)

  if ("geometry" %in% colnames(subset_data)) {
    subset_data <- sf::st_drop_geometry(subset_data)
  }

  relevant_vars <- all.vars(formula)
  relevant_vars <- c(relevant_vars, random_effect_var)
  subset_data <- dplyr::select(subset_data, dplyr::all_of(relevant_vars))

  subset_data <- stats::na.omit(subset_data)

  subset_data[[random_effect_var]] <- as.factor(subset_data[[random_effect_var]])

  var_lengths <- sapply(subset_data, length)
  if (any(var_lengths != var_lengths[1])) {
    stop("Variable lengths differ in the subset data.")
  }

  model <- tryCatch({
    nlme::lme(fixed = formula, random = as.formula(paste0("~ 1 | ", random_effect_var)), data = subset_data, method = "REML")
  }, error = function(e) {
    stop(paste("Error fitting model:", e$message))
  })

  coef_summary <- summary(model)$tTable
  coefs <- coef_summary[, "Value"]
  se <- coef_summary[, "Std.Error"]
  p_values <- coef_summary[, "p-value"]

  r_squared <- MuMIn::r.squaredGLMM(model)

  results <- data.frame(
    term = rownames(coef_summary),
    estimate = coefs,
    std_error = se,
    p_value = p_values,
    stringsAsFactors = FALSE
  )

  r_squared_df <- data.frame(
    term = c("Marginal R-squared", "Conditional R-squared"),
    estimate = c(r_squared[1], r_squared[2]),
    std_error = c(NA, NA),
    p_value = c(NA, NA),
    stringsAsFactors = FALSE
  )

  results <- rbind(results, r_squared_df)

  results$location <- location_index

  return(results)
}

utils::globalVariables(c("term", "estimate", "std_error", "p_value", "cluster_id", "cluster_area", "REGCODE", "lambda_d", "mu_beta_m", "n_k", "sigma_hat_sq", "sum_of_squares"))

#' Loop through all locations and run GLMM for each
#'
#' This function runs a mixed-effects generalized linear model (GLMM) for each location within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param random_effect_var The variable to be used as a random effect (e.g., `hhid`).
#' @param family The family to be used for GLM (e.g., `binomial` for logistic regression, `poisson` for Poisson regression).
#' @return A dataframe containing the results
#' @examples

#' set.seed(123)
#'
#' # Create dummy data
#'   library(dplyr)
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE),  # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)     # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Create HHid (Household ID), grouping every 3-4 records, and convert to character
#' dummy_data$HHid <- as.character(rep(1:20, each = 5, length.out = nrow(dummy_data)))
#'
#' # Create a binary outcome variable for years of education
#' dummy_data$education_binary <- ifelse(dummy_data$years_education > 11, 1, 0)
#'
#' # Define a logistic regression formula
#' formula <- education_binary ~ gender_female + household_wealth:gender_female
#'
#' location_var <- "district_code"
#' random_effect_var <- "HHid"
#'
#' # Run the logistic mixed-effects model across all locations (districts)
#' results <- DHSr::Repglmre2(data = dummy_data, formula = formula,
#'                            location_var = location_var, random_effect_var = random_effect_var,
#'                            family = binomial())
#'
#' # Print the results
#' print(head(results))
#' @importFrom tidyr pivot_wider
#' @export
Repglmre2 <- function(data, formula, location_var, random_effect_var, family) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }

  location_indices <- unique(data[[location_var]])
  all_results <- data.frame()

  for (location_index in location_indices) {
    result <- tryCatch({
      DHSr::single_glmre2(data, formula, location_var, random_effect_var, location_index, family)
    }, error = function(e) {
      message(paste("Error processing location index:", location_index, "-", e$message))
      return(NULL)
    })

    if (!is.null(result)) {
      all_results <- rbind(all_results, result)
    }
  }

  reshaped_results <- tidyr::pivot_wider(
    all_results,
    names_from = term,
    values_from = c(estimate, std_error, p_value),
    names_sep = "_"
  )

  return(reshaped_results)
}
#' Create Spatial Weights List
#'
#' This function creates a spatial weights list using a shapefile and a dataset.
#'
#' @param shapefile_path A string specifying the file path to the shapefile.
#' @param data A dataframe containing the variables to be analyzed.
#' @param loc_shape A string specifying the column name in the shapefile used for merging.
#' @param loc_data A string specifying the column name in the dataset that corresponds to the location variable.
#' @param weight_function A function to calculate weights from distances. Defaults to `function(d) exp(-d / 0.2)`.
#' @return A spatial weights list object of class `listw`.
#' @examples
#' if (requireNamespace("spData", quietly = TRUE)) {
#'     library(dplyr)
#'     library(sf)
#'
#'     # Load US states data
#'     us_states <- spData::us_states
#'
#'     # Simplify for demonstration: Select a subset of columns
#'     us_states_data <- us_states %>%
#'         select(GEOID, NAME) %>%
#'         mutate(mean_wealth = rnorm(nrow(us_states), 50, 10))  # Add mock data
#'
#'     # Define a temporary shapefile path
#'     shapefile_path <- tempfile(fileext = ".shp")
#'     sf::st_write(us_states, shapefile_path, quiet = TRUE)
#'
#'     # Use the listw function from the package
#'     us_states_listw <- DHSr::listw(
#'         shapefile_path = shapefile_path,
#'         data = us_states_data %>% sf::st_drop_geometry(),  # Drop geometry for compatibility
#'         loc_shape = "GEOID",
#'         loc_data = "GEOID",
#'         weight_function = function(d) exp(-d / 0.2)
#'     )
#'
#'     # Verify the spatial weights list
#'     print(us_states_listw)
#' }
#' @importFrom sf st_read st_centroid st_geometry st_coordinates
#' @importFrom dplyr distinct
#' @importFrom stats dist
#' @importFrom spdep mat2listw
#' @export
listw <- function(shapefile_path, data, loc_shape, loc_data, weight_function = function(d) exp(-d / 0.2)) {
  shape_data <- sf::st_read(shapefile_path)

  # Check if loc_shape exists in the data, if not, create it from loc_data
  if (!(loc_shape %in% names(data))) {
    data[[loc_shape]] <- data[[loc_data]]
  }

  shape_data <- dplyr::distinct(shape_data, !!sym(loc_shape), .keep_all = TRUE)

  merged_data <- merge(shape_data, data, by = loc_shape, all.y = TRUE)

  coords <- sf::st_centroid(sf::st_geometry(merged_data))
  coords <- sf::st_coordinates(coords)
  distances <- as.matrix(stats::dist(coords))

  weights <- weight_function(distances)

  diag(weights) <- 0

  listw <- spdep::mat2listw(weights, style = "W")

  return(listw)
}

#' Calculate Local Moran's I and Sign Combination Variables
#'
#' This function calculates Local Moran's I for a specified variable in a dataset
#' and creates sign combination variables based on the standardized variable
#' and the local Moran's I values.
#'
#' @param data A dataframe containing the spatial data.
#' @param variable_name A string representing the name of the variable to be analyzed.
#' @param listw A listw object containing spatial weights for the dataset.
#' @return A data frame containing the original data with additional columns:
#' \item{lisa_I}{Local Moran's I values for the specified variable.}
#' \item{lisa_p}{P-values corresponding to the Local Moran's I values.}
#' \item{z_i}{Standardized values of the input variable.}
#' \item{sign_combination2}{Categories based on the sign of z_i and lisa_I (e.g., "positive-negative").}
#' \item{sign_combination3}{Categories based on the sign of z_i and lisa_I (e.g., "High-High").}
#' @examples
#' # Load necessary libraries
#' if (requireNamespace("spData", quietly = TRUE)) {
#'   library(spData)
#'   library(sf)
#'   library(dplyr)
#'
#'   # Use US states data as a substitute for a shapefile
#'   us_states <- spData::us_states
#'
#'   # Simplify for demonstration: Select a subset of columns
#'   us_states_data <- us_states %>%
#'     select(GEOID, NAME) %>%
#'     mutate(mean_wealth = rnorm(nrow(us_states), 50, 10))  # Add mock data
#'
#'   # Define a temporary shapefile path
#'   shapefile_path <- tempfile(fileext = ".shp")
#'   sf::st_write(us_states, shapefile_path, quiet = TRUE)
#'
#'   # Create spatial weights using the listw function from the package
#'   us_states_listw <- DHSr::listw(
#'     shapefile_path = shapefile_path,
#'     data = us_states_data %>% sf::st_drop_geometry(),  # Drop geometry for compatibility
#'     loc_shape = "GEOID",
#'     loc_data = "GEOID",
#'     weight_function = function(d) exp(-d / 0.2)
#'   )
#'
#'   # Apply the Spdeplisa function
#'   lisa_result <- DHSr::Spdeplisa(
#'     data = us_states_data,
#'     variable_name = "mean_wealth",
#'     listw = us_states_listw
#'   )
#'
#'   # View the result
#'   head(lisa_result)
#' }
#' @importFrom spdep localmoran
#' @importFrom dplyr mutate case_when
#' @export
Spdeplisa <- function(data, variable_name, listw) {
  data[[variable_name]][is.na(data[[variable_name]])] <- 0

  local_moran <- spdep::localmoran(data[[variable_name]], listw)

  data$lisa_I <- local_moran[, 1]
  data$lisa_p <- local_moran[, 5]

  data$z_i <- (data[[variable_name]] - mean(data[[variable_name]], na.rm = TRUE)) / stats::sd(data[[variable_name]], na.rm = TRUE)

  data <- dplyr::mutate(
    data,
    sign_combination2 = dplyr::case_when(
      data$z_i > 0 & data$lisa_I < 0 ~ "positive-negative",
      data$z_i < 0 & data$lisa_I < 0 ~ "negative-positive",
      TRUE ~ "others"
    ),
    sign_combination3 = dplyr::case_when(
      data$z_i > 0 & data$lisa_I > 0 ~ "High-High",
      data$z_i < 0 & data$lisa_I > 0 ~ "Low-Low",
      TRUE ~ "others"
    )
  )

  return(data)
}

# Declare global variables to suppress R CMD check notes
utils::globalVariables(c("cluster_id", "cluster_area"))

#' Create Cluster Map Based on Local Moran's I
#'
#' This function creates a map of clusters based on Local Moran's I values. It identifies
#' clusters using Queen contiguity and visualizes them on a map.
#'
#' @param dataset A spatial dataset of class `sf`.
#' @param lisa_value The name of the variable in the dataset containing Local Moran's I values.
#' @param lisa_label The name of the variable in the dataset containing the LISA label.
#' @param label The specific label to filter clusters.
#' @param lisa_cutoff A numeric value specifying the cutoff for LISA values.
#' @param location_var The variable name indicating the primary location in the dataset.
#' @param location_name The name of the variable for the location names.
#' @param level2 An optional second level of location hierarchy. Default is `NULL`.
#' @param id_start The starting value for cluster IDs. Default is `0`.
#' @param comparison The comparison operator for filtering (`>`, `<`, `>=`, etc.). Default is `>`.
#' @param min_area Minimum area required for a cluster to be considered valid. Default is `0`.
#' @param min_ Minimum number of districts required for a cluster to be valid. Default is `5`.
#' @param title The title of the map. Default is `"Clusters Based on Queen Contiguity"`.
#' @param subtitle A subtitle for the map. Default is `""`.
#' @param footnote A footnote for the map. Default is `""`.
#' @param legend_position The position of the legend on the map. Default is `"bottom"`.
#' @param color_scheme The color scheme for the map. Default is `"C"`.
#' @return A list with the following components:
#' \item{dataset_with_clusters}{An `sf` object containing the dataset with assigned cluster IDs.}
#' \item{summary_clusters}{A data frame summarizing cluster information, including regions, area, and the number of locations.}
#' \item{plot}{A `ggplot` object for visualizing the clusters.}
#' @examples
#' if (requireNamespace("spData", quietly = TRUE)) {
#'   library(sf)
#'   library(dplyr)
#'   library(spdep)
#'   library(spData)
#'   library(ggplot2)
#'
#'   # Load US states data from spData
#'   us_states <- spData::us_states
#'
#'   # Simplify for demonstration: Select a subset of columns
#'   us_states_data <- us_states %>%
#'     select(GEOID, NAME) %>%
#'     mutate(mean_wealth = rnorm(nrow(us_states), 50, 10))  # Add mock data
#'
#'   # Define a shapefile path
#'   shapefile_path <- tempfile(fileext = ".shp")
#'   sf::st_write(us_states, shapefile_path, quiet = TRUE)
#'
#'   # Corrected listw function call using your package
#'   us_states_listw <- DHSr::listw(
#'     shapefile_path = shapefile_path,
#'     data = us_states_data %>% sf::st_drop_geometry(),  # Drop geometry for compatibility
#'     loc_shape = "GEOID",
#'     loc_data = "GEOID",
#'     weight_function = function(d) exp(-d / 0.2)
#'   )
#'
#'   # Apply Spdeplisa function
#'   lisa_result <- DHSr::Spdeplisa(
#'     data = us_states_data,
#'     variable_name = "mean_wealth",
#'     listw = us_states_listw
#'   )
#'
#'   # Add LISA labels
#'   lisa_result <- lisa_result %>%
#'     mutate(lisa_label = case_when(
#'       lisa_I > 0 ~ "High-High",
#'       lisa_I < 0 ~ "Low-Low",
#'       TRUE ~ "Others"
#'     ))
#'
#'   # Apply cluster_map function
#'   cluster_map_result <- DHSr::cluster_map(
#'     dataset = lisa_result,
#'     lisa_value = "lisa_I",
#'     lisa_label = "lisa_label",
#'     label = "High-High",
#'     lisa_cutoff = 0.5,
#'     location_var = "GEOID",
#'     location_name = "NAME",
#'     id_start = 1,
#'     comparison = ">",
#'     min_area = 0,
#'     min_ = 3,  # Reduced for smaller demonstration
#'     title = "Clusters Based on Queen Contiguity",
#'     subtitle = "High-High Clusters",
#'     footnote = "Generated using DHSr package",
#'     legend_position = "bottom",
#'     color_scheme = "C"
#'   )
#'
#'   # View the resulting dataset with clusters
#'   head(cluster_map_result$dataset_with_clusters)
#'
#'   # View the cluster summary
#'   print(cluster_map_result$summary_clusters)
#'
#'   # Plot the clusters
#'   print(cluster_map_result$plot)
#' }
#' @importFrom dplyr filter sym select mutate group_by summarize ungroup distinct
#' @importFrom sf st_drop_geometry st_centroid st_geometry st_coordinates st_as_sf
#' @importFrom spdep poly2nb n.comp.nb mat2listw
#' @importFrom ggplot2 ggplot geom_sf aes scale_fill_manual labs theme_minimal theme element_rect
#' @importFrom viridis viridis_pal
#' @importFrom rlang parse_expr
#' @export
cluster_map <- function(dataset, lisa_value, lisa_label, label, lisa_cutoff, location_var, location_name, level2 = NULL, id_start = 0, comparison = ">", min_area = 0, min_ = 5,
                        title = "Clusters Based on Queen Contiguity", subtitle = "", footnote = "", legend_position = "bottom", color_scheme = "C") {
  if (!inherits(dataset, "sf")) {
    stop("The dataset must be an sf object")
  }

  filter_expr <- rlang::parse_expr(paste0("`", lisa_value, "`", comparison, lisa_cutoff))

  filtered_dataset <- dplyr::filter(dataset, !!rlang::sym(lisa_label) == label & !!filter_expr) %>%
    sf::st_as_sf()

  queen_nb <- spdep::poly2nb(filtered_dataset)

  components <- spdep::n.comp.nb(queen_nb)$comp.id

  filtered_dataset <- dplyr::mutate(filtered_dataset, cluster_id = components + id_start)

  filtered_dataset <- dplyr::group_by(filtered_dataset, cluster_id) %>%
    dplyr::mutate(cluster_area = sum(as.numeric(sf::st_area(sf::st_geometry(filtered_dataset))))) %>%
    dplyr::ungroup()

  filtered_dataset$cluster_area <- as.numeric(filtered_dataset$cluster_area)

  cluster_sizes <- table(filtered_dataset$cluster_id)
  valid_clusters <- names(cluster_sizes[cluster_sizes >= min_])
  filtered_dataset <- dplyr::filter(filtered_dataset, cluster_id %in% valid_clusters & cluster_area >= min_area)

  filtered_dataset <- dplyr::mutate(filtered_dataset, cluster_id = as.factor(as.numeric(factor(cluster_id))))

  if (!is.null(level2)) {
    filtered_dataset_simple <- filtered_dataset %>%
      sf::st_drop_geometry() %>%
      dplyr::select(!!rlang::sym(location_var), !!rlang::sym(location_name), !!rlang::sym(level2), cluster_id, cluster_area) %>%
      dplyr::distinct(!!rlang::sym(location_var), !!rlang::sym(level2), .keep_all = TRUE)

    dataset_with_clusters <- merge(dataset, filtered_dataset_simple, by.x = c(location_var, level2), by.y = c(location_var, level2), all.x = TRUE)
  } else {
    filtered_dataset_simple <- filtered_dataset %>%
      sf::st_drop_geometry() %>%
      dplyr::select(!!rlang::sym(location_var), !!rlang::sym(location_name), cluster_id, cluster_area) %>%
      dplyr::distinct(!!rlang::sym(location_var), .keep_all = TRUE)

    dataset_with_clusters <- merge(dataset, filtered_dataset_simple, by.x = location_var, by.y = location_var, all.x = TRUE)
  }

  dataset_with_clusters <- sf::st_as_sf(dataset_with_clusters)

  if (!is.null(level2)) {
    summary_clusters <- dplyr::group_by(filtered_dataset, cluster_id) %>%
      dplyr::summarize(
        regions = paste(unique(paste(!!rlang::sym(location_name), "(", !!rlang::sym(level2), ")", sep = "")), collapse = ", "),
        cluster_area = unique(cluster_area),
        num_ = dplyr::n()
      ) %>%
      dplyr::ungroup()
  } else {
    summary_clusters <- dplyr::group_by(filtered_dataset, cluster_id) %>%
      dplyr::summarize(
        regions = paste(unique(!!rlang::sym(location_name)), collapse = ", "),
        cluster_area = unique(cluster_area),
        num_ = dplyr::n()
      ) %>%
      dplyr::ungroup()
  }

  unique_cluster_ids <- unique(dataset_with_clusters$cluster_id[!is.na(dataset_with_clusters$cluster_id)])
  fill_colors <- c(stats::setNames(viridis::viridis_pal(option = color_scheme)(length(unique_cluster_ids)), unique_cluster_ids), "white")

  plot <- ggplot2::ggplot(dataset_with_clusters) +
    ggplot2::geom_sf(ggplot2::aes(fill = cluster_id), color = NA) +
    ggplot2::scale_fill_manual(values = fill_colors, na.value = "white", name = "Cluster ID", drop = FALSE) +
    ggplot2::labs(title = title, subtitle = subtitle, caption = footnote) +
    ggplot2::theme_minimal() +
    ggplot2::theme(legend.position = legend_position, panel.background = ggplot2::element_rect(fill = "lightgrey"), plot.background = ggplot2::element_rect(fill = "lightgrey"))

  list(dataset_with_clusters = dataset_with_clusters, summary_clusters = summary_clusters, plot = plot)
}

utils::globalVariables(c("mu_beta_m", "n_k", "sigma_hat_sq", "sum_of_squares", "lambda_d"))

#' Calculate Stein's Beta for Each Cluster
#'
#' This function calculates Stein's Beta for each cluster within the dataset.
#' It applies Stein's shrinkage estimator to the specified beta estimates within each cluster.
#'
#' @param data A dataframe containing the data.
#' @param cluster_id The name of the column representing the cluster IDs.
#' @param beta The name of the column representing the beta estimates.
#' @return A data frame containing the input data with additional columns:
#' \item{stein_beta}{The Stein-shrinkage adjusted beta values.}
#' \item{lambda_d}{Shrinkage factors for each cluster.}
#' \item{mu_beta_m}{Mean beta values for each cluster.}
#' \item{sigma_hat_sq}{Estimated variance of the beta values within clusters.}
#' \item{sum_of_squares}{Sum of squared deviations of beta values from their mean.}
#' @examples
#' # Create dummy data
#' library(dplyr)
#' set.seed(123)
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE),  # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)     # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Define a regression formula
#' formula <- years_education ~ gender_female + household_wealth + household_wealth:gender_female
#'
#' # Run the regression for all districts
#' results1 <- DHSr::Replm2(dummy_data, formula, "district_code", "normal")
#'
#' # Assign random clusters for demonstration
#' clusters <- data.frame(
#'   district_code = unique(dummy_data$district_code),
#'   cluster_id = sample(1:3, length(unique(dummy_data$district_code)), replace = TRUE)
#' )
#'
#' # Merge clusters with regression results
#' cluster_beta <- merge(clusters, results1, by.x = "district_code", by.y = "location")
#'
#' # Apply Stein Beta shrinkage
#' results_with_stein_beta <- DHSr::stein_beta(
#'   data = cluster_beta,
#'   cluster_id = "cluster_id",                # Column for cluster IDs
#'   beta = "estimate_gender_female"          # Column for beta estimates
#' )
#'
#' # View results
#' print(head(results_with_stein_beta))
#' @importFrom dplyr filter mutate group_by summarize ungroup sym
#' @export
stein_beta <- function(data, cluster_id, beta) {
  data <- dplyr::filter(data, !is.na(!!rlang::sym(cluster_id)) & !is.na(!!rlang::sym(beta)))

  data <- dplyr::group_by(data, !!rlang::sym(cluster_id)) %>%
    dplyr::mutate(
      n_k = dplyr::n(),
      mu_beta_m = mean(!!rlang::sym(beta)),
      sigma_hat_sq = stats::var(!!rlang::sym(beta)),
      sum_of_squares = sum((!!rlang::sym(beta) - mu_beta_m)^2),
      lambda_d = max(0, 1 - (n_k - 2) * sigma_hat_sq / sum_of_squares),
      stein_beta = lambda_d * mu_beta_m + (1 - lambda_d) * !!rlang::sym(beta)
    ) %>%
    dplyr::ungroup()

  return(data)
}

Try the DHSr package in your browser

Any scripts or data that you put into this service are public.

DHSr documentation built on April 4, 2025, 12:18 a.m.