.onAttach <- function(libname, pkgname) {
  required_packages <- c("dplyr", "ggplot2", "rlang", "sf", "spdep", "viridis", "nlme", "MASS", "MuMIn", "tidyr", "spData")
  missing_packages <- required_packages[!sapply(required_packages, requireNamespace, quietly = TRUE)]
  if (length(missing_packages) > 0) {
    packageStartupMessage("The following packages are missing and should be installed: ", paste(missing_packages, collapse = ", "))
  } else {
    packageStartupMessage("DHSr package loaded with all dependencies.")
  }
}
#' Linear Regression Analysis for Specified Location
#'
#' This function runs a linear regression model for a specified location within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param response_distribution The distribution of the response variable: `"normal"` for a Gaussian response fitted with `lm()`, or `"other"` for any other distribution fitted with `glm()` (requires `family`).
#' @param family The family to be used for GLM if response_distribution is "other" (e.g., `binomial` for logistic regression).
#' @param location_index The specific location index or number for which the model should be run.
#' @return A dataframe containing the results for the specified location.
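#' @details For `response_distribution = "normal"` the model is fit with `stats::lm()`, and an
#' extra row labelled `"R-squared"` is appended to the returned coefficient table (with
#' `std_error` and `p_value` set to `NA`). For `"other"` a `stats::glm()` is fit with the
#' supplied `family`; p-values are read from the `Pr(>|z|)` column, so families reporting
#' z statistics (such as binomial or Poisson) are assumed.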
#' @examples
#' set.seed(123)
#' if (requireNamespace("dplyr", quietly = TRUE)) {
#'   library(dplyr)
#'   # Create dummy data
#'   dummy_data <- data.frame(
#'     years_education = rnorm(100, 12, 3),    # Represents years of education
#'     gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'     household_wealth = sample(1:5, 100, replace = TRUE), # Wealth index from 1 to 5
#'     district_code = sample(1:10, 100, replace = TRUE)    # Represents district codes
#'   ) %>% arrange(district_code)
#'
#'   # Define a simple regression formula
#'   formula <- years_education ~ gender_female + household_wealth + household_wealth:gender_female
#'
#'   # Run the regression for a specific location (e.g., district 1)
#'   result_single_lm <- single_lm2(dummy_data, formula, "district_code",
#'                                  response_distribution = "normal", location_index = 1)
#'
#'   # View the result
#'   print(result_single_lm)
#' }
#' @importFrom stats as.formula glm lm na.omit
#' @importFrom sf st_drop_geometry
#' @importFrom dplyr filter sym select all_of
#' @importFrom dplyr %>%
#' @importFrom stats sd setNames
#' @export
single_lm2 <- function(data, formula, location_var, response_distribution = "normal", family = NULL, location_index) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }
  subset_data <- dplyr::filter(data, !!rlang::sym(location_var) == location_index)
  if ("geometry" %in% colnames(subset_data)) {
    subset_data <- sf::st_drop_geometry(subset_data)
  }
  relevant_vars <- all.vars(formula)
  subset_data <- dplyr::select(subset_data, dplyr::all_of(relevant_vars))
  subset_data <- stats::na.omit(subset_data)
  var_lengths <- sapply(subset_data, length)
  if (any(var_lengths != var_lengths[1])) {
    stop("Variable lengths differ in the subset data.")
  }
  model <- tryCatch({
    if (response_distribution == "normal") {
      stats::lm(formula, data = subset_data)
    } else if (response_distribution == "other" && !is.null(family)) {
      stats::glm(formula, family = family, data = subset_data)
    } else {
      stop("Unsupported response distribution or missing family parameter for GLM.")
    }
  }, error = function(e) {
    stop(paste("Error fitting model:", e$message))
  })
  coef_summary <- summary(model)$coefficients
  coefs <- coef_summary[, "Estimate"]
  se <- coef_summary[, "Std. Error"]
  if (response_distribution == "normal") {
    p_values <- coef_summary[, "Pr(>|t|)"]
    r_squared <- summary(model)$r.squared
    r_squared_df <- data.frame(
      term = "R-squared",
      estimate = r_squared,
      std_error = NA,
      p_value = NA,
      stringsAsFactors = FALSE
    )
  } else {
    p_values <- coef_summary[, "Pr(>|z|)"]
    r_squared_df <- NULL
  }
  results <- data.frame(
    term = rownames(coef_summary),
    estimate = coefs,
    std_error = se,
    p_value = p_values,
    stringsAsFactors = FALSE
  )
  if (!is.null(r_squared_df)) {
    results <- rbind(results, r_squared_df)
  }
  results$location <- location_index
  return(results)
}
# Declare global variables to suppress R CMD check notes
utils::globalVariables(c("term", "estimate", "std_error", "p_value", "cluster_id", "cluster_area", "polygon_area", "REGCODE", "lambda_d", "mu_beta_m", "n_k", "sigma_hat_sq", "sum_of_squares", "z_i", "lisa_I"))
#' Run Regression Analysis for All Locations
#'
#' This function runs a regression model for all unique locations within a dataset and combines the results.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param response_distribution The distribution of the response variable: `"normal"` for a Gaussian response fitted with `lm()`, or `"other"` for any other distribution fitted with `glm()` (requires `family`).
#' @param family The family to be used for GLM if response_distribution is "other" (e.g., `binomial` for logistic regression).
#' @return A dataframe containing the combined results for all locations.
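#' @details Locations for which the model fails to fit are skipped with a message rather than
#' aborting the loop. The per-location results are reshaped to wide format, so each term
#' yields columns named `estimate_<term>`, `std_error_<term>`, and `p_value_<term>` (for
#' example, `estimate_gender_female`), plus a `location` column.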
#' @examples
#' set.seed(123)
#' library(dplyr)
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE), # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)    # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Define a simple regression formula
#' formula <- years_education ~ gender_female + household_wealth + household_wealth:gender_female
#'
#' # Run the regression across all locations (districts)
#' results1 <- Replm2(dummy_data, formula, "district_code", "normal")
#' print(results1)
#' @importFrom tidyr pivot_wider
#' @export
Replm2 <- function(data, formula, location_var, response_distribution = "normal", family = NULL) {
  location_indices <- unique(data[[location_var]])
  all_results <- list()
  for (location_index in location_indices) {
    result <- tryCatch({
      single_lm2(data, formula, location_var, response_distribution, family, location_index)
    }, error = function(e) {
      message(paste("Error processing location index", location_index, ":", e$message))
      return(NULL)
    })
    if (!is.null(result)) {
      all_results[[as.character(location_index)]] <- result
    }
  }
  combined_results <- do.call(rbind, all_results)
  reshaped_results <- tidyr::pivot_wider(
    combined_results,
    names_from = term,
    values_from = c(estimate, std_error, p_value),
    names_sep = "_"
  )
  return(reshaped_results)
}
#' Mixed-Effects Regression Analysis for Specified Location
#'
#' This function runs a mixed-effects regression model for a specified location within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param random_effect_var The variable to be used as a random effect (e.g., `hhid`).
#' @param location_index The specific location index or number for which the model should be run.
#' @return A dataframe containing the results for the specified location.
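#' @details The model is fit with `nlme::lme()` using a random intercept for
#' `random_effect_var` (i.e., `~ 1 | random_effect_var`). Marginal and conditional
#' R-squared values from `MuMIn::r.squaredGLMM()` are appended as extra rows of the
#' returned table.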
#' @examples
#' set.seed(123)
#' library(dplyr)
#' # Create dummy data
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE), # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)    # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Create HHid (Household ID), grouping every 5 records, and convert to character
#' dummy_data$HHid <- as.character(rep(1:20, each = 5, length.out = nrow(dummy_data)))
#'
#' # Define a simple regression formula
#' formula <- years_education ~ gender_female + household_wealth:gender_female
#'
#' # Set the location and random effect variables
#' location_var <- "district_code"
#' random_effect_var <- "HHid"
#'
#' # Run the mixed-effects regression for a specific location (e.g., district 1)
#' result_single_lmre <- single_lmre2(dummy_data, formula, location_var,
#'                                    random_effect_var, location_index = 1)
#'
#' # View the result
#' print(result_single_lmre)
#' @importFrom nlme lme
#' @importFrom MuMIn r.squaredGLMM
#' @export
single_lmre2 <- function(data, formula, location_var, random_effect_var, location_index) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }
  subset_data <- dplyr::filter(data, !!rlang::sym(location_var) == location_index)
  if ("geometry" %in% colnames(subset_data)) {
    subset_data <- sf::st_drop_geometry(subset_data)
  }
  relevant_vars <- all.vars(formula)
  relevant_vars <- c(relevant_vars, random_effect_var)
  subset_data <- dplyr::select(subset_data, dplyr::all_of(relevant_vars))
  subset_data <- stats::na.omit(subset_data)
  subset_data[[random_effect_var]] <- as.factor(subset_data[[random_effect_var]])
  var_lengths <- sapply(subset_data, length)
  if (any(var_lengths != var_lengths[1])) {
    stop("Variable lengths differ in the subset data.")
  }
  model <- tryCatch({
    nlme::lme(as.formula(formula), random = as.formula(paste0("~ 1 | ", random_effect_var)), data = subset_data)
  }, error = function(e) {
    stop(paste("Error fitting model:", e$message))
  })
  coef_summary <- summary(model)$tTable
  coefs <- coef_summary[, "Value"]
  se <- coef_summary[, "Std.Error"]
  p_values <- coef_summary[, "p-value"]
  r_squared <- MuMIn::r.squaredGLMM(model)
  results <- data.frame(
    term = rownames(coef_summary),
    estimate = coefs,
    std_error = se,
    p_value = p_values,
    stringsAsFactors = FALSE
  )
  r_squared_df <- data.frame(
    term = c("Marginal R-squared", "Conditional R-squared"),
    estimate = c(r_squared[1], r_squared[2]),
    std_error = c(NA, NA),
    p_value = c(NA, NA),
    stringsAsFactors = FALSE
  )
  results <- rbind(results, r_squared_df)
  results$location <- location_index
  return(results)
}
#' Mixed-Effects Regression Analysis for All Locations
#'
#' This function runs a mixed-effects regression model for all locations within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param random_effect_var The variable to be used as a random effect (e.g., `hhid`).
#' @return A dataframe containing the combined results for all locations, in wide format.
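#' @details Locations for which the model fails to fit are skipped with a message. As in
#' `Replm2`, the results are reshaped to wide format with columns named `estimate_<term>`,
#' `std_error_<term>`, and `p_value_<term>`.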
#' @examples
#' set.seed(123)
#' library(dplyr)
#' # Create dummy data
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE), # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)    # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Create HHid (Household ID), grouping every 5 records, and convert to character
#' dummy_data$HHid <- as.character(rep(1:20, each = 5, length.out = nrow(dummy_data)))
#'
#' # Define a simple regression formula
#' formula <- years_education ~ gender_female + household_wealth:gender_female
#' location_var <- "district_code"
#' random_effect_var <- "HHid"
#'
#' # Run mixed-effects regression for all districts
#' results <- DHSr::Replmre2(dummy_data, formula, location_var, random_effect_var)
#' print(head(results))
#' @importFrom tidyr pivot_wider
#' @export
Replmre2 <- function(data, formula, location_var, random_effect_var) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }
  location_indices <- unique(data[[location_var]])
  all_results <- data.frame()
  for (location_index in location_indices) {
    result <- tryCatch({
      DHSr::single_lmre2(data, formula, location_var, random_effect_var, location_index)
    }, error = function(e) {
      message(paste("Error processing location index:", location_index, "-", e$message))
      return(NULL)
    })
    if (!is.null(result)) {
      all_results <- rbind(all_results, result)
    }
  }
  reshaped_results <- tidyr::pivot_wider(
    all_results,
    names_from = term,
    values_from = c(estimate, std_error, p_value),
    names_sep = "_"
  )
  return(reshaped_results)
}
#' Mixed-Effects Logistic Regression Analysis for a Specified Location
#'
#' This function runs a mixed-effects logistic regression model for a specified location within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param random_effect_var The variable to be used as a random effect (e.g., `hhid`).
#' @param location_index The specific location index or number for which the model should be run.
#' @param family The family to be used for GLM (e.g., `binomial` for logistic regression).
#' @return A dataframe containing the results for the specified location.
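#' @details When `family` is supplied, the model is fit by penalized quasi-likelihood via
#' `MASS::glmmPQL()` with a random intercept for `random_effect_var` (so `family = binomial()`
#' gives a mixed-effects logistic regression); when `family` is `NULL`, the function falls
#' back to a linear mixed model via `nlme::lme()`. Marginal and conditional R-squared values
#' from `MuMIn::r.squaredGLMM()` are appended as extra rows.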
#' @examples
#' set.seed(123)
#' library(dplyr)
#' # Create dummy data
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE), # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)    # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Create HHid (Household ID), grouping every 5 records, and convert to character
#' dummy_data$HHid <- as.character(rep(1:20, each = 5, length.out = nrow(dummy_data)))
#'
#' # Create a binary outcome variable for years of education
#' dummy_data$education_binary <- ifelse(dummy_data$years_education > 11, 1, 0)
#'
#' # Define a logistic regression formula
#' formula <- education_binary ~ gender_female + household_wealth:gender_female
#'
#' # Set the location and random effect variables
#' location_var <- "district_code"
#' random_effect_var <- "HHid"
#'
#' # Run the mixed-effects logistic regression for a specific location (e.g., district 1)
#' result_single_glmre <- single_glmre2(dummy_data, formula, location_var, random_effect_var,
#'                                      location_index = 1, family = binomial())
#'
#' # View the result
#' print(result_single_glmre)
#' @importFrom nlme lme
#' @importFrom MASS glmmPQL
#' @export
single_glmre2 <- function(data, formula, location_var, random_effect_var, location_index, family = NULL) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }
  subset_data <- dplyr::filter(data, !!rlang::sym(location_var) == location_index)
  if ("geometry" %in% colnames(subset_data)) {
    subset_data <- sf::st_drop_geometry(subset_data)
  }
  relevant_vars <- all.vars(formula)
  relevant_vars <- c(relevant_vars, random_effect_var)
  subset_data <- dplyr::select(subset_data, dplyr::all_of(relevant_vars))
  subset_data <- stats::na.omit(subset_data)
  subset_data[[random_effect_var]] <- as.factor(subset_data[[random_effect_var]])
  var_lengths <- sapply(subset_data, length)
  if (any(var_lengths != var_lengths[1])) {
    stop("Variable lengths differ in the subset data.")
  }
  random_formula <- as.formula(paste0("~ 1 | ", random_effect_var))
  model <- tryCatch({
    if (is.null(family)) {
      # Without a family, fall back to a linear mixed model
      nlme::lme(fixed = formula, random = random_formula, data = subset_data, method = "REML")
    } else {
      # glmmPQL fits the GLMM by penalized quasi-likelihood and honours `family`
      MASS::glmmPQL(fixed = formula, random = random_formula, family = family, data = subset_data, verbose = FALSE)
    }
  }, error = function(e) {
    stop(paste("Error fitting model:", e$message))
  })
  coef_summary <- summary(model)$tTable
  coefs <- coef_summary[, "Value"]
  se <- coef_summary[, "Std.Error"]
  p_values <- coef_summary[, "p-value"]
  # r.squaredGLMM may return several rows for non-Gaussian models; take the first
  r_squared <- MuMIn::r.squaredGLMM(model)
  results <- data.frame(
    term = rownames(coef_summary),
    estimate = coefs,
    std_error = se,
    p_value = p_values,
    stringsAsFactors = FALSE
  )
  r_squared_df <- data.frame(
    term = c("Marginal R-squared", "Conditional R-squared"),
    estimate = c(r_squared[1, "R2m"], r_squared[1, "R2c"]),
    std_error = c(NA, NA),
    p_value = c(NA, NA),
    stringsAsFactors = FALSE
  )
  results <- rbind(results, r_squared_df)
  results$location <- location_index
  return(results)
}
#' Mixed-Effects GLM (GLMM) Analysis for All Locations
#'
#' This function runs a mixed-effects generalized linear model (GLMM) for each location within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param random_effect_var The variable to be used as a random effect (e.g., `hhid`).
#' @param family The family to be used for GLM (e.g., `binomial` for logistic regression, `poisson` for Poisson regression).
#' @return A dataframe containing the combined results for all locations, in wide format.
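#' @details Each location is fit independently with `single_glmre2`; locations that fail are
#' skipped with a message. The results are reshaped to wide format with columns named
#' `estimate_<term>`, `std_error_<term>`, and `p_value_<term>`.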
#' @examples
#' set.seed(123)
#'
#' # Create dummy data
#' library(dplyr)
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE), # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)    # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Create HHid (Household ID), grouping every 5 records, and convert to character
#' dummy_data$HHid <- as.character(rep(1:20, each = 5, length.out = nrow(dummy_data)))
#'
#' # Create a binary outcome variable for years of education
#' dummy_data$education_binary <- ifelse(dummy_data$years_education > 11, 1, 0)
#'
#' # Define a logistic regression formula
#' formula <- education_binary ~ gender_female + household_wealth:gender_female
#'
#' location_var <- "district_code"
#' random_effect_var <- "HHid"
#'
#' # Run the logistic mixed-effects model across all locations (districts)
#' results <- DHSr::Repglmre2(data = dummy_data, formula = formula,
#'                            location_var = location_var, random_effect_var = random_effect_var,
#'                            family = binomial())
#'
#' # Print the results
#' print(head(results))
#' @importFrom tidyr pivot_wider
#' @export
Repglmre2 <- function(data, formula, location_var, random_effect_var, family) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }
  location_indices <- unique(data[[location_var]])
  all_results <- data.frame()
  for (location_index in location_indices) {
    result <- tryCatch({
      DHSr::single_glmre2(data, formula, location_var, random_effect_var, location_index, family)
    }, error = function(e) {
      message(paste("Error processing location index:", location_index, "-", e$message))
      return(NULL)
    })
    if (!is.null(result)) {
      all_results <- rbind(all_results, result)
    }
  }
  reshaped_results <- tidyr::pivot_wider(
    all_results,
    names_from = term,
    values_from = c(estimate, std_error, p_value),
    names_sep = "_"
  )
  return(reshaped_results)
}
#' Create Spatial Weights List
#'
#' This function creates a spatial weights list using a shapefile and a dataset.
#'
#' @param shapefile_path A string specifying the file path to the shapefile.
#' @param data A dataframe containing the variables to be analyzed.
#' @param loc_shape A string specifying the column name in the shapefile used for merging.
#' @param loc_data A string specifying the column name in the dataset that corresponds to the location variable.
#' @param weight_function A function to calculate weights from distances. Defaults to `function(d) exp(-d / 0.2)`.
#' @return A spatial weights list object of class `listw`.
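#' @details Weights are computed by applying `weight_function` to the matrix of pairwise
#' Euclidean distances between polygon centroids, so the distance units follow the
#' shapefile's coordinate reference system. The default kernel, `function(d) exp(-d / 0.2)`,
#' decays smoothly with distance; any other decreasing function of distance (for
#' illustration, an inverse-distance kernel such as `function(d) 1 / (1 + d)`) can be
#' supplied instead. The diagonal is zeroed and the matrix is row-standardised
#' (`style = "W"`).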
#' @examples
#' if (requireNamespace("spData", quietly = TRUE)) {
#'   library(dplyr)
#'   library(sf)
#'
#'   # Load US states data
#'   us_states <- spData::us_states
#'
#'   # Simplify for demonstration: Select a subset of columns
#'   us_states_data <- us_states %>%
#'     select(GEOID, NAME) %>%
#'     mutate(mean_wealth = rnorm(nrow(us_states), 50, 10)) # Add mock data
#'
#'   # Define a temporary shapefile path
#'   shapefile_path <- tempfile(fileext = ".shp")
#'   sf::st_write(us_states, shapefile_path, quiet = TRUE)
#'
#'   # Use the listw function from the package
#'   us_states_listw <- DHSr::listw(
#'     shapefile_path = shapefile_path,
#'     data = us_states_data %>% sf::st_drop_geometry(), # Drop geometry for compatibility
#'     loc_shape = "GEOID",
#'     loc_data = "GEOID",
#'     weight_function = function(d) exp(-d / 0.2)
#'   )
#'
#'   # Verify the spatial weights list
#'   print(us_states_listw)
#' }
#' @importFrom sf st_read st_centroid st_geometry st_coordinates
#' @importFrom dplyr distinct
#' @importFrom stats dist
#' @importFrom spdep mat2listw
#' @export
listw <- function(shapefile_path, data, loc_shape, loc_data, weight_function = function(d) exp(-d / 0.2)) {
  shape_data <- sf::st_read(shapefile_path)
  # If loc_shape is absent from the data, create it from loc_data
  if (!(loc_shape %in% names(data))) {
    data[[loc_shape]] <- data[[loc_data]]
  }
  shape_data <- dplyr::distinct(shape_data, !!rlang::sym(loc_shape), .keep_all = TRUE)
  merged_data <- merge(shape_data, data, by = loc_shape, all.y = TRUE)
  coords <- sf::st_centroid(sf::st_geometry(merged_data))
  coords <- sf::st_coordinates(coords)
  distances <- as.matrix(stats::dist(coords))
  weights <- weight_function(distances)
  diag(weights) <- 0 # No self-neighbours
  listw <- spdep::mat2listw(weights, style = "W") # Row-standardised weights
  return(listw)
}
#' Calculate Local Moran's I and Sign Combination Variables
#'
#' This function calculates Local Moran's I for a specified variable in a dataset
#' and creates sign combination variables based on the standardized variable
#' and the local Moran's I values.
#'
#' @param data A dataframe containing the spatial data.
#' @param variable_name A string representing the name of the variable to be analyzed.
#' @param listw A listw object containing spatial weights for the dataset.
#' @return A data frame containing the original data with additional columns:
#' \item{lisa_I}{Local Moran's I values for the specified variable.}
#' \item{lisa_p}{P-values corresponding to the Local Moran's I values.}
#' \item{z_i}{Standardized values of the input variable.}
#' \item{sign_combination2}{Categories based on the sign of z_i and lisa_I (e.g., "positive-negative").}
#' \item{sign_combination3}{Categories based on the sign of z_i and lisa_I (e.g., "High-High").}
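#' @details Missing values of the input variable are replaced with 0 before Local Moran's I
#' is computed, and `z_i` is the variable standardised to mean 0 and standard deviation 1.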
#' @examples
#' # Load necessary libraries
#' if (requireNamespace("spData", quietly = TRUE)) {
#'   library(spData)
#'   library(sf)
#'   library(dplyr)
#'
#'   # Use US states data as a substitute for a shapefile
#'   us_states <- spData::us_states
#'
#'   # Simplify for demonstration: Select a subset of columns
#'   us_states_data <- us_states %>%
#'     select(GEOID, NAME) %>%
#'     mutate(mean_wealth = rnorm(nrow(us_states), 50, 10)) # Add mock data
#'
#'   # Define a temporary shapefile path
#'   shapefile_path <- tempfile(fileext = ".shp")
#'   sf::st_write(us_states, shapefile_path, quiet = TRUE)
#'
#'   # Create spatial weights using the listw function from the package
#'   us_states_listw <- DHSr::listw(
#'     shapefile_path = shapefile_path,
#'     data = us_states_data %>% sf::st_drop_geometry(), # Drop geometry for compatibility
#'     loc_shape = "GEOID",
#'     loc_data = "GEOID",
#'     weight_function = function(d) exp(-d / 0.2)
#'   )
#'
#'   # Apply the Spdeplisa function
#'   lisa_result <- DHSr::Spdeplisa(
#'     data = us_states_data,
#'     variable_name = "mean_wealth",
#'     listw = us_states_listw
#'   )
#'
#'   # View the result
#'   head(lisa_result)
#' }
#' @importFrom spdep localmoran
#' @importFrom dplyr mutate case_when
#' @export
Spdeplisa <- function(data, variable_name, listw) {
  data[[variable_name]][is.na(data[[variable_name]])] <- 0 # Impute missing values with 0
  local_moran <- spdep::localmoran(data[[variable_name]], listw)
  data$lisa_I <- local_moran[, 1] # Local Moran's I statistic
  data$lisa_p <- local_moran[, 5] # Associated p-value
  data$z_i <- (data[[variable_name]] - mean(data[[variable_name]], na.rm = TRUE)) / stats::sd(data[[variable_name]], na.rm = TRUE)
  data <- dplyr::mutate(
    data,
    sign_combination2 = dplyr::case_when(
      z_i > 0 & lisa_I < 0 ~ "positive-negative",
      z_i < 0 & lisa_I < 0 ~ "negative-positive",
      TRUE ~ "others"
    ),
    sign_combination3 = dplyr::case_when(
      z_i > 0 & lisa_I > 0 ~ "High-High",
      z_i < 0 & lisa_I > 0 ~ "Low-Low",
      TRUE ~ "others"
    )
  )
  return(data)
}
#' Create Cluster Map Based on Local Moran's I
#'
#' This function creates a map of clusters based on Local Moran's I values. It identifies
#' clusters using Queen contiguity and visualizes them on a map.
#'
#' @param dataset A spatial dataset of class `sf`.
#' @param lisa_value The name of the variable in the dataset containing Local Moran's I values.
#' @param lisa_label The name of the variable in the dataset containing the LISA label.
#' @param label The specific label to filter clusters.
#' @param lisa_cutoff A numeric value specifying the cutoff for LISA values.
#' @param location_var The variable name indicating the primary location in the dataset.
#' @param location_name The name of the variable for the location names.
#' @param level2 An optional second level of location hierarchy. Default is `NULL`.
#' @param id_start The starting value for cluster IDs. Default is `0`.
#' @param comparison The comparison operator for filtering (`>`, `<`, `>=`, etc.). Default is `>`.
#' @param min_area Minimum area required for a cluster to be considered valid. Default is `0`.
#' @param min_ Minimum number of locations required for a cluster to be valid. Default is `5`.
#' @param title The title of the map. Default is `"Clusters Based on Queen Contiguity"`.
#' @param subtitle A subtitle for the map. Default is `""`.
#' @param footnote A footnote for the map. Default is `""`.
#' @param legend_position The position of the legend on the map. Default is `"bottom"`.
#' @param color_scheme The color scheme for the map. Default is `"C"`.
#' @return A list with the following components:
#' \item{dataset_with_clusters}{An `sf` object containing the dataset with assigned cluster IDs.}
#' \item{summary_clusters}{A data frame summarizing cluster information, including regions, area, and the number of locations.}
#' \item{plot}{A `ggplot` object for visualizing the clusters.}
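#' @details Clusters are formed by filtering the dataset to rows whose `lisa_label` matches
#' `label` and whose `lisa_value` satisfies `comparison` against `lisa_cutoff`, building a
#' Queen-contiguity neighbour list with `spdep::poly2nb()`, and taking the connected
#' components (`spdep::n.comp.nb()`) as candidate clusters. Clusters with fewer than `min_`
#' locations or less total area than `min_area` are discarded.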
#' @examples
#' if (requireNamespace("spData", quietly = TRUE)) {
#'   library(sf)
#'   library(dplyr)
#'   library(spdep)
#'   library(spData)
#'   library(ggplot2)
#'
#'   # Load US states data from spData
#'   us_states <- spData::us_states
#'
#'   # Simplify for demonstration: Select a subset of columns
#'   us_states_data <- us_states %>%
#'     select(GEOID, NAME) %>%
#'     mutate(mean_wealth = rnorm(nrow(us_states), 50, 10)) # Add mock data
#'
#'   # Define a shapefile path
#'   shapefile_path <- tempfile(fileext = ".shp")
#'   sf::st_write(us_states, shapefile_path, quiet = TRUE)
#'
#'   # Create spatial weights with the listw function from the package
#'   us_states_listw <- DHSr::listw(
#'     shapefile_path = shapefile_path,
#'     data = us_states_data %>% sf::st_drop_geometry(), # Drop geometry for compatibility
#'     loc_shape = "GEOID",
#'     loc_data = "GEOID",
#'     weight_function = function(d) exp(-d / 0.2)
#'   )
#'
#'   # Apply Spdeplisa function
#'   lisa_result <- DHSr::Spdeplisa(
#'     data = us_states_data,
#'     variable_name = "mean_wealth",
#'     listw = us_states_listw
#'   )
#'
#'   # Add LISA labels
#'   lisa_result <- lisa_result %>%
#'     mutate(lisa_label = case_when(
#'       lisa_I > 0 ~ "High-High",
#'       lisa_I < 0 ~ "Low-Low",
#'       TRUE ~ "Others"
#'     ))
#'
#'   # Apply cluster_map function
#'   cluster_map_result <- DHSr::cluster_map(
#'     dataset = lisa_result,
#'     lisa_value = "lisa_I",
#'     lisa_label = "lisa_label",
#'     label = "High-High",
#'     lisa_cutoff = 0.5,
#'     location_var = "GEOID",
#'     location_name = "NAME",
#'     id_start = 1,
#'     comparison = ">",
#'     min_area = 0,
#'     min_ = 3, # Reduced for smaller demonstration
#'     title = "Clusters Based on Queen Contiguity",
#'     subtitle = "High-High Clusters",
#'     footnote = "Generated using DHSr package",
#'     legend_position = "bottom",
#'     color_scheme = "C"
#'   )
#'
#'   # View the resulting dataset with clusters
#'   head(cluster_map_result$dataset_with_clusters)
#'
#'   # View the cluster summary
#'   print(cluster_map_result$summary_clusters)
#'
#'   # Plot the clusters
#'   print(cluster_map_result$plot)
#' }
#' @importFrom dplyr filter sym select mutate group_by summarize ungroup distinct
#' @importFrom sf st_drop_geometry st_centroid st_geometry st_coordinates st_as_sf
#' @importFrom spdep poly2nb n.comp.nb mat2listw
#' @importFrom ggplot2 ggplot geom_sf aes scale_fill_manual labs theme_minimal theme element_rect
#' @importFrom viridis viridis_pal
#' @importFrom rlang parse_expr
#' @export
cluster_map <- function(dataset, lisa_value, lisa_label, label, lisa_cutoff, location_var, location_name, level2 = NULL, id_start = 0, comparison = ">", min_area = 0, min_ = 5,
                        title = "Clusters Based on Queen Contiguity", subtitle = "", footnote = "", legend_position = "bottom", color_scheme = "C") {
  if (!inherits(dataset, "sf")) {
    stop("The dataset must be an sf object")
  }
  filter_expr <- rlang::parse_expr(paste0("`", lisa_value, "`", comparison, lisa_cutoff))
  filtered_dataset <- dplyr::filter(dataset, !!rlang::sym(lisa_label) == label & !!filter_expr) %>%
    sf::st_as_sf()
  # Identify clusters as connected components of the Queen-contiguity graph
  queen_nb <- spdep::poly2nb(filtered_dataset)
  components <- spdep::n.comp.nb(queen_nb)$comp.id
  filtered_dataset <- dplyr::mutate(filtered_dataset, cluster_id = components + id_start)
  # Compute each polygon's area first so the grouped sum gives per-cluster totals
  filtered_dataset$polygon_area <- as.numeric(sf::st_area(sf::st_geometry(filtered_dataset)))
  filtered_dataset <- dplyr::group_by(filtered_dataset, cluster_id) %>%
    dplyr::mutate(cluster_area = sum(polygon_area)) %>%
    dplyr::ungroup()
  filtered_dataset$cluster_area <- as.numeric(filtered_dataset$cluster_area)
  cluster_sizes <- table(filtered_dataset$cluster_id)
  valid_clusters <- names(cluster_sizes[cluster_sizes >= min_])
  filtered_dataset <- dplyr::filter(filtered_dataset, cluster_id %in% valid_clusters & cluster_area >= min_area)
  filtered_dataset <- dplyr::mutate(filtered_dataset, cluster_id = as.factor(as.numeric(factor(cluster_id))))
  if (!is.null(level2)) {
    filtered_dataset_simple <- filtered_dataset %>%
      sf::st_drop_geometry() %>%
      dplyr::select(!!rlang::sym(location_var), !!rlang::sym(location_name), !!rlang::sym(level2), cluster_id, cluster_area) %>%
      dplyr::distinct(!!rlang::sym(location_var), !!rlang::sym(level2), .keep_all = TRUE)
    dataset_with_clusters <- merge(dataset, filtered_dataset_simple, by.x = c(location_var, level2), by.y = c(location_var, level2), all.x = TRUE)
  } else {
    filtered_dataset_simple <- filtered_dataset %>%
      sf::st_drop_geometry() %>%
      dplyr::select(!!rlang::sym(location_var), !!rlang::sym(location_name), cluster_id, cluster_area) %>%
      dplyr::distinct(!!rlang::sym(location_var), .keep_all = TRUE)
    dataset_with_clusters <- merge(dataset, filtered_dataset_simple, by.x = location_var, by.y = location_var, all.x = TRUE)
  }
  dataset_with_clusters <- sf::st_as_sf(dataset_with_clusters)
  if (!is.null(level2)) {
    summary_clusters <- dplyr::group_by(filtered_dataset, cluster_id) %>%
      dplyr::summarize(
        regions = paste(unique(paste(!!rlang::sym(location_name), "(", !!rlang::sym(level2), ")", sep = "")), collapse = ", "),
        cluster_area = unique(cluster_area),
        num_ = dplyr::n()
      ) %>%
      dplyr::ungroup()
  } else {
    summary_clusters <- dplyr::group_by(filtered_dataset, cluster_id) %>%
      dplyr::summarize(
        regions = paste(unique(!!rlang::sym(location_name)), collapse = ", "),
        cluster_area = unique(cluster_area),
        num_ = dplyr::n()
      ) %>%
      dplyr::ungroup()
  }
  unique_cluster_ids <- unique(dataset_with_clusters$cluster_id[!is.na(dataset_with_clusters$cluster_id)])
  fill_colors <- c(stats::setNames(viridis::viridis_pal(option = color_scheme)(length(unique_cluster_ids)), unique_cluster_ids), "white")
  plot <- ggplot2::ggplot(dataset_with_clusters) +
    ggplot2::geom_sf(ggplot2::aes(fill = cluster_id), color = NA) +
    ggplot2::scale_fill_manual(values = fill_colors, na.value = "white", name = "Cluster ID", drop = FALSE) +
    ggplot2::labs(title = title, subtitle = subtitle, caption = footnote) +
    ggplot2::theme_minimal() +
    ggplot2::theme(legend.position = legend_position, panel.background = ggplot2::element_rect(fill = "lightgrey"), plot.background = ggplot2::element_rect(fill = "lightgrey"))
  list(dataset_with_clusters = dataset_with_clusters, summary_clusters = summary_clusters, plot = plot)
}
#' Calculate Stein's Beta for Each Cluster
#'
#' This function calculates Stein's Beta for each cluster within the dataset.
#' It applies Stein's shrinkage estimator to the specified beta estimates within each cluster.
#'
#' @param data A dataframe containing the data.
#' @param cluster_id The name of the column representing the cluster IDs.
#' @param beta The name of the column representing the beta estimates.
#' @return A data frame containing the input data with additional columns:
#' \item{stein_beta}{The Stein-shrinkage adjusted beta values.}
#' \item{lambda_d}{Shrinkage factors for each cluster.}
#' \item{mu_beta_m}{Mean beta values for each cluster.}
#' \item{sigma_hat_sq}{Estimated variance of the beta values within clusters.}
#' \item{sum_of_squares}{Sum of squared deviations of beta values from their mean.}
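#' @details For each cluster \eqn{k} containing \eqn{n_k} beta estimates with cluster mean
#' \eqn{\bar{\beta}_k}, sample variance \eqn{\hat{\sigma}^2_k}, and sum of squares
#' \eqn{SS_k = \sum_i (\beta_i - \bar{\beta}_k)^2}, the shrinkage factor is
#' \deqn{\lambda_k = \max\left(0,\ 1 - \frac{(n_k - 2)\,\hat{\sigma}^2_k}{SS_k}\right),}
#' and each estimate is shrunk toward the cluster mean:
#' \deqn{\tilde{\beta}_i = \lambda_k \bar{\beta}_k + (1 - \lambda_k)\,\beta_i.}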
#' @examples
#' # Create dummy data
#' library(dplyr)
#' set.seed(123)
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE), # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)    # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Define a regression formula
#' formula <- years_education ~ gender_female + household_wealth + household_wealth:gender_female
#'
#' # Run the regression for all districts
#' results1 <- DHSr::Replm2(dummy_data, formula, "district_code", "normal")
#'
#' # Assign random clusters for demonstration
#' clusters <- data.frame(
#'   district_code = unique(dummy_data$district_code),
#'   cluster_id = sample(1:3, length(unique(dummy_data$district_code)), replace = TRUE)
#' )
#'
#' # Merge clusters with regression results
#' cluster_beta <- merge(clusters, results1, by.x = "district_code", by.y = "location")
#'
#' # Apply Stein Beta shrinkage
#' results_with_stein_beta <- DHSr::stein_beta(
#'   data = cluster_beta,
#'   cluster_id = "cluster_id",        # Column for cluster IDs
#'   beta = "estimate_gender_female"   # Column for beta estimates
#' )
#'
#' # View results
#' print(head(results_with_stein_beta))
#' @importFrom dplyr filter mutate group_by summarize ungroup sym
#' @export
stein_beta <- function(data, cluster_id, beta) {
  data <- dplyr::filter(data, !is.na(!!rlang::sym(cluster_id)) & !is.na(!!rlang::sym(beta)))
  data <- dplyr::group_by(data, !!rlang::sym(cluster_id)) %>%
    dplyr::mutate(
      n_k = dplyr::n(),
      mu_beta_m = mean(!!rlang::sym(beta)),
      sigma_hat_sq = stats::var(!!rlang::sym(beta)),
      sum_of_squares = sum((!!rlang::sym(beta) - mu_beta_m)^2),
      # pmax keeps the shrinkage factor vectorised and bounded below at 0
      lambda_d = pmax(0, 1 - (n_k - 2) * sigma_hat_sq / sum_of_squares),
      stein_beta = lambda_d * mu_beta_m + (1 - lambda_d) * !!rlang::sym(beta)
    ) %>%
    dplyr::ungroup()
  return(data)
}