R/hello.R
In DHSr: Create Large Scale Repeated Regression Summary Statistics Dataset and Visualization Seamlessly

Documented in cluster_map listw Repglmre2 Replm2 Replmre2 single_glmre2 single_lm2 single_lmre2 Spdeplisa stein_beta

.onAttach <- function(libname, pkgname) {
  required_packages <- c("dplyr", "ggplot2", "rlang", "sf", "spdep", "viridis", "nlme", "MuMIn", "tidyr", "spData")

  missing_packages <- required_packages[!sapply(required_packages, requireNamespace, quietly = TRUE)]

  if (length(missing_packages) > 0) {
    packageStartupMessage("The following packages are missing and should be installed: ", paste(missing_packages, collapse = ", "))
  }

  packageStartupMessage("DHSr package loaded with all dependencies.")
}

#' Linear Regression Analysis for Specified Location
#'
#' This function runs a linear regression model for a specified location within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param response_distribution The distribution of the response variable ("normal" for normal distribution, "other" for other distributions).
#' @param family The family to be used for GLM if response_distribution is "other" (e.g., `binomial` for logistic regression).
#' @param location_index The specific location index or number for which the model should be run.
#' @return A dataframe containing the results for the specified location.
#' @examples
#' set.seed(123)
#' if (requireNamespace("dplyr", quietly = TRUE)) {
#'   library(dplyr)
#' # Create dummy data
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE),  # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)     # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Define a simple regression formula
#' formula <- years_education ~ gender_female + household_wealth + household_wealth:gender_female
#'
#' # Run the regression for a specific location (e.g., district 1)
#' result_single_lm <- single_lm2(dummy_data, formula, "district_code",
#'  response_distribution = "normal", location_index = 1)
#'
#' # View the result
#' print(result_single_lm)
#' }
#' @importFrom stats as.formula glm lm na.omit
#' @importFrom sf st_drop_geometry
#' @importFrom dplyr filter sym select all_of
#' @importFrom dplyr %>%
#' @importFrom stats sd setNames
#' @export
single_lm2 <- function(data, formula, location_var, response_distribution = "normal", family = NULL, location_index) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }

  subset_data <- dplyr::filter(data, !!rlang::sym(location_var) == location_index)

  if ("geometry" %in% colnames(subset_data)) {
    subset_data <- sf::st_drop_geometry(subset_data)
  }

  relevant_vars <- all.vars(formula)
  subset_data <- dplyr::select(subset_data, dplyr::all_of(relevant_vars))

  subset_data <- stats::na.omit(subset_data)

  var_lengths <- sapply(subset_data, length)
  if (any(var_lengths != var_lengths[1])) {
    stop("Variable lengths differ in the subset data.")
  }

  model <- tryCatch({
    if (response_distribution == "normal") {
      stats::lm(formula, data = subset_data)
    } else if (response_distribution == "other" && !is.null(family)) {
      stats::glm(formula, family = family, data = subset_data)
    } else {
      stop("Unsupported response distribution or missing family parameter for GLM.")
    }
  }, error = function(e) {
    stop(paste("Error fitting model:", e$message))
  })

  coef_summary <- summary(model)$coefficients
  coefs <- coef_summary[, "Estimate"]
  se <- coef_summary[, "Std. Error"]

  if (response_distribution == "normal") {
    p_values <- coef_summary[, "Pr(>|t|)"]
    r_squared <- summary(model)$r.squared
    r_squared_df <- data.frame(
      term = "R-squared",
      estimate = r_squared,
      std_error = NA,
      p_value = NA,
      stringsAsFactors = FALSE
    )
  } else {
    p_values <- coef_summary[, "Pr(>|z|)"]
    r_squared_df <- NULL
  }

  results <- data.frame(
    term = rownames(coef_summary),
    estimate = coefs,
    std_error = se,
    p_value = p_values,
    stringsAsFactors = FALSE
  )

  if (!is.null(r_squared_df)) {
    results <- rbind(results, r_squared_df)
  }

  results$location <- location_index

  return(results)
}

utils::globalVariables(c("term", "estimate", "std_error", "p_value", "cluster_id", "cluster_area", "REGCODE", "lambda_d", "mu_beta_m", "n_k", "sigma_hat_sq", "sum_of_squares"))

#' Run Regression Analysis for All Locations
#'
#' This function runs a regression model for all unique locations within a dataset and combines the results.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param response_distribution The distribution of the response variable ("normal" for normal distribution, "other" for other distributions).
#' @param family The family to be used for GLM if response_distribution is "other" (e.g., `binomial` for logistic regression).
#' @return A dataframe containing the combined results for all locations.
#' @examples
#' set.seed(123)
#' library(dplyr)
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE),  # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE) # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Define a simple regression formula
#' formula <- years_education ~ gender_female + household_wealth + household_wealth:gender_female
#'
#' # Run the regression across all locations (districts)
#' results1 <- Replm2(dummy_data, formula, "district_code", "normal")
#' print(results1)
#' @importFrom tidyr pivot_wider
#' @export
Replm2 <- function(data, formula, location_var, response_distribution = "normal", family = NULL) {
  location_indices <- unique(data[[location_var]])

  all_results <- list()

  for (location_index in location_indices) {
    result <- tryCatch({
      single_lm2(data, formula, location_var, response_distribution, family, location_index)
    }, error = function(e) {
      message(paste("Error processing location index", location_index, ":", e$message))
      return(NULL)
    })

    if (!is.null(result)) {
      all_results[[as.character(location_index)]] <- result
    }
  }

  combined_results <- do.call(rbind, all_results)

  reshaped_results <- tidyr::pivot_wider(
    combined_results,
    names_from = term,
    values_from = c(estimate, std_error, p_value),
    names_sep = "_"
  )

  return(reshaped_results)
}

#' Mixed-Effects Regression Analysis for Specified Location
#'
#' This function runs a mixed-effects regression model for a specified location within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param random_effect_var The variable to be used as a random effect (e.g., `hhid`).
#' @param location_index The specific location index or number for which the model should be run.
#' @return the results for the specified location to test
#' @examples
#' set.seed(123)
#' library(dplyr)
#' # Create dummy data
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE),  # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)     # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Create HHid (Household ID), grouping every 3-4 records, and convert to character
#' dummy_data$HHid <- as.character(rep(1:20, each = 5, length.out = nrow(dummy_data)))
#'
#' # Define a simple regression formula
#' formula <- years_education ~ gender_female + household_wealth:gender_female
#'
#' # Set the location and random effect variables
#' location_var <- "district_code"
#' random_effect_var <- "HHid"
#'
#' # Run the mixed-effects regression for a specific location (e.g., district 1)
#' result_single_lmre <- single_lmre2(dummy_data, formula, location_var,
#' random_effect_var, location_index = 1)
#'
#' # View the result
#' print(result_single_lmre)
#' @importFrom nlme lme
#' @importFrom MuMIn r.squaredGLMM
#' @export
single_lmre2 <- function(data, formula, location_var, random_effect_var, location_index) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }

  subset_data <- dplyr::filter(data, !!rlang::sym(location_var) == location_index)

  if ("geometry" %in% colnames(subset_data)) {
    subset_data <- sf::st_drop_geometry(subset_data)
  }

  relevant_vars <- all.vars(formula)
  relevant_vars <- c(relevant_vars, random_effect_var)
  subset_data <- dplyr::select(subset_data, dplyr::all_of(relevant_vars))

  subset_data <- stats::na.omit(subset_data)

  subset_data[[random_effect_var]] <- as.factor(subset_data[[random_effect_var]])

  var_lengths <- sapply(subset_data, length)
  if (any(var_lengths != var_lengths[1])) {
    stop("Variable lengths differ in the subset data.")
  }

  model <- tryCatch({
    nlme::lme(as.formula(formula), random = as.formula(paste0("~ 1 | ", random_effect_var)), data = subset_data)
  }, error = function(e) {
    stop(paste("Error fitting model:", e$message))
  })

  coef_summary <- summary(model)$tTable
  coefs <- coef_summary[, "Value"]
  se <- coef_summary[, "Std.Error"]
  p_values <- coef_summary[, "p-value"]

  r_squared <- MuMIn::r.squaredGLMM(model)

  results <- data.frame(
    term = rownames(coef_summary),
    estimate = coefs,
    std_error = se,
    p_value = p_values,
    stringsAsFactors = FALSE
  )

  r_squared_df <- data.frame(
    term = c("Marginal R-squared", "Conditional R-squared"),
    estimate = c(r_squared[1], r_squared[2]),
    std_error = c(NA, NA),
    p_value = c(NA, NA),
    stringsAsFactors = FALSE
  )
  results <- rbind(results, r_squared_df)

  results$location <- location_index

  return(results)
}

utils::globalVariables(c("term", "estimate", "std_error", "p_value", "cluster_id", "cluster_area", "REGCODE", "lambda_d", "mu_beta_m", "n_k", "sigma_hat_sq", "sum_of_squares"))

#' Mixed-Effects Regression Analysis for All Locations
#'
#' This function runs a mixed-effects regression model for all locations within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param random_effect_var The variable to be used as a random effect (e.g., `hhid`).
#' @return A dataframe containing the results
#' @examples

#' set.seed(123)
#' library(dplyr)
#' # Create dummy data
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE),  # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)     # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Create HHid (Household ID), grouping every 3-4 records, and convert to character
#' dummy_data$HHid <- as.character(rep(1:20, each = 5, length.out = nrow(dummy_data)))
#'
#' # Define a simple regression formula
#' formula <- years_education ~ gender_female + household_wealth:gender_female
#' location_var <- "district_code"
#' random_effect_var <- "HHid"
#'
#' # Run mixed-effects regression for all districts
#' results <- DHSr::Replmre2(dummy_data, formula, location_var, random_effect_var)
#' print(head(results))
#' @importFrom tidyr pivot_wider
#' @export
Replmre2 <- function(data, formula, location_var, random_effect_var) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }

  location_indices <- unique(data[[location_var]])
  all_results <- data.frame()

  for (location_index in location_indices) {
    result <- tryCatch({
      DHSr::single_lmre2(data, formula, location_var, random_effect_var, location_index)
    }, error = function(e) {
      message(paste("Error processing location index:", location_index, "-", e$message))
      return(NULL)
    })

    if (!is.null(result)) {
      all_results <- rbind(all_results, result)
    }
  }

  reshaped_results <- tidyr::pivot_wider(
    all_results,
    names_from = term,
    values_from = c(estimate, std_error, p_value),
    names_sep = "_"
  )


  return(reshaped_results)
}

#' Mixed-Effects Logistic Regression Analysis for a Specified Location
#'
#' This function runs a mixed-effects logistic regression model for a specified location within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param random_effect_var The variable to be used as a random effect (e.g., `hhid`).
#' @param location_index The specific location index or number for which the model should be run.
#' @param family The family to be used for GLM (e.g., `binomial` for logistic regression).
#' @return The results for sigle location to test
#' @examples
#' set.seed(123)
#'   library(dplyr)
#' # Create dummy data
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE),  # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)     # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Create HHid (Household ID), grouping every 3-4 records, and convert to character
#' dummy_data$HHid <- as.character(rep(1:20, each = 5, length.out = nrow(dummy_data)))
#'
#' # Create a binary outcome variable for years of education
#' dummy_data$education_binary <- ifelse(dummy_data$years_education > 11, 1, 0)
#'
#' # Define a logistic regression formula
#' formula <- education_binary ~ gender_female + household_wealth:gender_female
#'
#' # Set the location and random effect variables
#' location_var <- "district_code"
#' random_effect_var <- "HHid"
#'
#' # Run the mixed-effects logistic regression for a specific location (e.g., district 1)
#' result_single_glmre <- single_glmre2(dummy_data, formula, location_var, random_effect_var,
#'  location_index = 1, family = binomial())
#'
#' # View the result
#' print(result_single_glmre)
#' @importFrom nlme lme
#' @export
single_glmre2 <- function(data, formula, location_var, random_effect_var, location_index, family = NULL) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }

  subset_data <- dplyr::filter(data, !!rlang::sym(location_var) == location_index)

  if ("geometry" %in% colnames(subset_data)) {
    subset_data <- sf::st_drop_geometry(subset_data)
  }

  relevant_vars <- all.vars(formula)
  relevant_vars <- c(relevant_vars, random_effect_var)
  subset_data <- dplyr::select(subset_data, dplyr::all_of(relevant_vars))

  subset_data <- stats::na.omit(subset_data)

  subset_data[[random_effect_var]] <- as.factor(subset_data[[random_effect_var]])

  var_lengths <- sapply(subset_data, length)
  if (any(var_lengths != var_lengths[1])) {
    stop("Variable lengths differ in the subset data.")
  }

  model <- tryCatch({
    nlme::lme(fixed = formula, random = as.formula(paste0("~ 1 | ", random_effect_var)), data = subset_data, method = "REML")
  }, error = function(e) {
    stop(paste("Error fitting model:", e$message))
  })

  coef_summary <- summary(model)$tTable
  coefs <- coef_summary[, "Value"]
  se <- coef_summary[, "Std.Error"]
  p_values <- coef_summary[, "p-value"]

  r_squared <- MuMIn::r.squaredGLMM(model)

  results <- data.frame(
    term = rownames(coef_summary),
    estimate = coefs,
    std_error = se,
    p_value = p_values,
    stringsAsFactors = FALSE
  )

  r_squared_df <- data.frame(
    term = c("Marginal R-squared", "Conditional R-squared"),
    estimate = c(r_squared[1], r_squared[2]),
    std_error = c(NA, NA),
    p_value = c(NA, NA),
    stringsAsFactors = FALSE
  )

  results <- rbind(results, r_squared_df)

  results$location <- location_index

  return(results)
}

utils::globalVariables(c("term", "estimate", "std_error", "p_value", "cluster_id", "cluster_area", "REGCODE", "lambda_d", "mu_beta_m", "n_k", "sigma_hat_sq", "sum_of_squares"))

#' Loop through all locations and run GLMM for each
#'
#' This function runs a mixed-effects generalized linear model (GLMM) for each location within a dataset.
#'
#' @param data The dataset to be analyzed.
#' @param formula The formula for the regression model.
#' @param location_var The variable indicating different locations (e.g., `REGCODE`).
#' @param random_effect_var The variable to be used as a random effect (e.g., `hhid`).
#' @param family The family to be used for GLM (e.g., `binomial` for logistic regression, `poisson` for Poisson regression).
#' @return A dataframe containing the results
#' @examples

#' set.seed(123)
#'
#' # Create dummy data
#'   library(dplyr)
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE),  # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)     # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Create HHid (Household ID), grouping every 3-4 records, and convert to character
#' dummy_data$HHid <- as.character(rep(1:20, each = 5, length.out = nrow(dummy_data)))
#'
#' # Create a binary outcome variable for years of education
#' dummy_data$education_binary <- ifelse(dummy_data$years_education > 11, 1, 0)
#'
#' # Define a logistic regression formula
#' formula <- education_binary ~ gender_female + household_wealth:gender_female
#'
#' location_var <- "district_code"
#' random_effect_var <- "HHid"
#'
#' # Run the logistic mixed-effects model across all locations (districts)
#' results <- DHSr::Repglmre2(data = dummy_data, formula = formula,
#'                            location_var = location_var, random_effect_var = random_effect_var,
#'                            family = binomial())
#'
#' # Print the results
#' print(head(results))
#' @importFrom tidyr pivot_wider
#' @export
Repglmre2 <- function(data, formula, location_var, random_effect_var, family) {
  if ("geometry" %in% colnames(data)) {
    data <- sf::st_drop_geometry(data)
  }

  location_indices <- unique(data[[location_var]])
  all_results <- data.frame()

  for (location_index in location_indices) {
    result <- tryCatch({
      DHSr::single_glmre2(data, formula, location_var, random_effect_var, location_index, family)
    }, error = function(e) {
      message(paste("Error processing location index:", location_index, "-", e$message))
      return(NULL)
    })

    if (!is.null(result)) {
      all_results <- rbind(all_results, result)
    }
  }

  reshaped_results <- tidyr::pivot_wider(
    all_results,
    names_from = term,
    values_from = c(estimate, std_error, p_value),
    names_sep = "_"
  )

  return(reshaped_results)
}
#' Create Spatial Weights List
#'
#' This function creates a spatial weights list using a shapefile and a dataset.
#'
#' @param shapefile_path A string specifying the file path to the shapefile.
#' @param data A dataframe containing the variables to be analyzed.
#' @param loc_shape A string specifying the column name in the shapefile used for merging.
#' @param loc_data A string specifying the column name in the dataset that corresponds to the location variable.
#' @param weight_function A function to calculate weights from distances. Defaults to `function(d) exp(-d / 0.2)`.
#' @return A spatial weights list object of class `listw`.
#' @examples
#' if (requireNamespace("spData", quietly = TRUE)) {
#'     library(dplyr)
#'     library(sf)
#'
#'     # Load US states data
#'     us_states <- spData::us_states
#'
#'     # Simplify for demonstration: Select a subset of columns
#'     us_states_data <- us_states %>%
#'         select(GEOID, NAME) %>%
#'         mutate(mean_wealth = rnorm(nrow(us_states), 50, 10))  # Add mock data
#'
#'     # Define a temporary shapefile path
#'     shapefile_path <- tempfile(fileext = ".shp")
#'     sf::st_write(us_states, shapefile_path, quiet = TRUE)
#'
#'     # Use the listw function from the package
#'     us_states_listw <- DHSr::listw(
#'         shapefile_path = shapefile_path,
#'         data = us_states_data %>% sf::st_drop_geometry(),  # Drop geometry for compatibility
#'         loc_shape = "GEOID",
#'         loc_data = "GEOID",
#'         weight_function = function(d) exp(-d / 0.2)
#'     )
#'
#'     # Verify the spatial weights list
#'     print(us_states_listw)
#' }
#' @importFrom sf st_read st_centroid st_geometry st_coordinates
#' @importFrom dplyr distinct
#' @importFrom stats dist
#' @importFrom spdep mat2listw
#' @export
listw <- function(shapefile_path, data, loc_shape, loc_data, weight_function = function(d) exp(-d / 0.2)) {
  shape_data <- sf::st_read(shapefile_path)

  # Check if loc_shape exists in the data, if not, create it from loc_data
  if (!(loc_shape %in% names(data))) {
    data[[loc_shape]] <- data[[loc_data]]
  }

  shape_data <- dplyr::distinct(shape_data, !!sym(loc_shape), .keep_all = TRUE)

  merged_data <- merge(shape_data, data, by = loc_shape, all.y = TRUE)

  coords <- sf::st_centroid(sf::st_geometry(merged_data))
  coords <- sf::st_coordinates(coords)
  distances <- as.matrix(stats::dist(coords))

  weights <- weight_function(distances)

  diag(weights) <- 0

  listw <- spdep::mat2listw(weights, style = "W")

  return(listw)
}

#' Calculate Local Moran's I and Sign Combination Variables
#'
#' This function calculates Local Moran's I for a specified variable in a dataset
#' and creates sign combination variables based on the standardized variable
#' and the local Moran's I values.
#'
#' @param data A dataframe containing the spatial data.
#' @param variable_name A string representing the name of the variable to be analyzed.
#' @param listw A listw object containing spatial weights for the dataset.
#' @return A data frame containing the original data with additional columns:
#' \item{lisa_I}{Local Moran's I values for the specified variable.}
#' \item{lisa_p}{P-values corresponding to the Local Moran's I values.}
#' \item{z_i}{Standardized values of the input variable.}
#' \item{sign_combination2}{Categories based on the sign of z_i and lisa_I (e.g., "positive-negative").}
#' \item{sign_combination3}{Categories based on the sign of z_i and lisa_I (e.g., "High-High").}
#' @examples
#' # Load necessary libraries
#' if (requireNamespace("spData", quietly = TRUE)) {
#'   library(spData)
#'   library(sf)
#'   library(dplyr)
#'
#'   # Use US states data as a substitute for a shapefile
#'   us_states <- spData::us_states
#'
#'   # Simplify for demonstration: Select a subset of columns
#'   us_states_data <- us_states %>%
#'     select(GEOID, NAME) %>%
#'     mutate(mean_wealth = rnorm(nrow(us_states), 50, 10))  # Add mock data
#'
#'   # Define a temporary shapefile path
#'   shapefile_path <- tempfile(fileext = ".shp")
#'   sf::st_write(us_states, shapefile_path, quiet = TRUE)
#'
#'   # Create spatial weights using the listw function from the package
#'   us_states_listw <- DHSr::listw(
#'     shapefile_path = shapefile_path,
#'     data = us_states_data %>% sf::st_drop_geometry(),  # Drop geometry for compatibility
#'     loc_shape = "GEOID",
#'     loc_data = "GEOID",
#'     weight_function = function(d) exp(-d / 0.2)
#'   )
#'
#'   # Apply the Spdeplisa function
#'   lisa_result <- DHSr::Spdeplisa(
#'     data = us_states_data,
#'     variable_name = "mean_wealth",
#'     listw = us_states_listw
#'   )
#'
#'   # View the result
#'   head(lisa_result)
#' }
#' @importFrom spdep localmoran
#' @importFrom dplyr mutate case_when
#' @export
Spdeplisa <- function(data, variable_name, listw) {
  data[[variable_name]][is.na(data[[variable_name]])] <- 0

  local_moran <- spdep::localmoran(data[[variable_name]], listw)

  data$lisa_I <- local_moran[, 1]
  data$lisa_p <- local_moran[, 5]

  data$z_i <- (data[[variable_name]] - mean(data[[variable_name]], na.rm = TRUE)) / stats::sd(data[[variable_name]], na.rm = TRUE)

  data <- dplyr::mutate(
    data,
    sign_combination2 = dplyr::case_when(
      data$z_i > 0 & data$lisa_I < 0 ~ "positive-negative",
      data$z_i < 0 & data$lisa_I < 0 ~ "negative-positive",
      TRUE ~ "others"
    ),
    sign_combination3 = dplyr::case_when(
      data$z_i > 0 & data$lisa_I > 0 ~ "High-High",
      data$z_i < 0 & data$lisa_I > 0 ~ "Low-Low",
      TRUE ~ "others"
    )
  )

  return(data)
}

# Declare global variables to suppress R CMD check notes
utils::globalVariables(c("cluster_id", "cluster_area"))

#' Create Cluster Map Based on Local Moran's I
#'
#' This function creates a map of clusters based on Local Moran's I values. It identifies
#' clusters using Queen contiguity and visualizes them on a map.
#'
#' @param dataset A spatial dataset of class `sf`.
#' @param lisa_value The name of the variable in the dataset containing Local Moran's I values.
#' @param lisa_label The name of the variable in the dataset containing the LISA label.
#' @param label The specific label to filter clusters.
#' @param lisa_cutoff A numeric value specifying the cutoff for LISA values.
#' @param location_var The variable name indicating the primary location in the dataset.
#' @param location_name The name of the variable for the location names.
#' @param level2 An optional second level of location hierarchy. Default is `NULL`.
#' @param id_start The starting value for cluster IDs. Default is `0`.
#' @param comparison The comparison operator for filtering (`>`, `<`, `>=`, etc.). Default is `>`.
#' @param min_area Minimum area required for a cluster to be considered valid. Default is `0`.
#' @param min_ Minimum number of districts required for a cluster to be valid. Default is `5`.
#' @param title The title of the map. Default is `"Clusters Based on Queen Contiguity"`.
#' @param subtitle A subtitle for the map. Default is `""`.
#' @param footnote A footnote for the map. Default is `""`.
#' @param legend_position The position of the legend on the map. Default is `"bottom"`.
#' @param color_scheme The color scheme for the map. Default is `"C"`.
#' @return A list with the following components:
#' \item{dataset_with_clusters}{An `sf` object containing the dataset with assigned cluster IDs.}
#' \item{summary_clusters}{A data frame summarizing cluster information, including regions, area, and the number of locations.}
#' \item{plot}{A `ggplot` object for visualizing the clusters.}
#' @examples
#' if (requireNamespace("spData", quietly = TRUE)) {
#'   library(sf)
#'   library(dplyr)
#'   library(spdep)
#'   library(spData)
#'   library(ggplot2)
#'
#'   # Load US states data from spData
#'   us_states <- spData::us_states
#'
#'   # Simplify for demonstration: Select a subset of columns
#'   us_states_data <- us_states %>%
#'     select(GEOID, NAME) %>%
#'     mutate(mean_wealth = rnorm(nrow(us_states), 50, 10))  # Add mock data
#'
#'   # Define a shapefile path
#'   shapefile_path <- tempfile(fileext = ".shp")
#'   sf::st_write(us_states, shapefile_path, quiet = TRUE)
#'
#'   # Corrected listw function call using your package
#'   us_states_listw <- DHSr::listw(
#'     shapefile_path = shapefile_path,
#'     data = us_states_data %>% sf::st_drop_geometry(),  # Drop geometry for compatibility
#'     loc_shape = "GEOID",
#'     loc_data = "GEOID",
#'     weight_function = function(d) exp(-d / 0.2)
#'   )
#'
#'   # Apply Spdeplisa function
#'   lisa_result <- DHSr::Spdeplisa(
#'     data = us_states_data,
#'     variable_name = "mean_wealth",
#'     listw = us_states_listw
#'   )
#'
#'   # Add LISA labels
#'   lisa_result <- lisa_result %>%
#'     mutate(lisa_label = case_when(
#'       lisa_I > 0 ~ "High-High",
#'       lisa_I < 0 ~ "Low-Low",
#'       TRUE ~ "Others"
#'     ))
#'
#'   # Apply cluster_map function
#'   cluster_map_result <- DHSr::cluster_map(
#'     dataset = lisa_result,
#'     lisa_value = "lisa_I",
#'     lisa_label = "lisa_label",
#'     label = "High-High",
#'     lisa_cutoff = 0.5,
#'     location_var = "GEOID",
#'     location_name = "NAME",
#'     id_start = 1,
#'     comparison = ">",
#'     min_area = 0,
#'     min_ = 3,  # Reduced for smaller demonstration
#'     title = "Clusters Based on Queen Contiguity",
#'     subtitle = "High-High Clusters",
#'     footnote = "Generated using DHSr package",
#'     legend_position = "bottom",
#'     color_scheme = "C"
#'   )
#'
#'   # View the resulting dataset with clusters
#'   head(cluster_map_result$dataset_with_clusters)
#'
#'   # View the cluster summary
#'   print(cluster_map_result$summary_clusters)
#'
#'   # Plot the clusters
#'   print(cluster_map_result$plot)
#' }
#' @importFrom dplyr filter sym select mutate group_by summarize ungroup distinct
#' @importFrom sf st_drop_geometry st_centroid st_geometry st_coordinates st_as_sf
#' @importFrom spdep poly2nb n.comp.nb mat2listw
#' @importFrom ggplot2 ggplot geom_sf aes scale_fill_manual labs theme_minimal theme element_rect
#' @importFrom viridis viridis_pal
#' @importFrom rlang parse_expr
#' @export
cluster_map <- function(dataset, lisa_value, lisa_label, label, lisa_cutoff, location_var, location_name, level2 = NULL, id_start = 0, comparison = ">", min_area = 0, min_ = 5,
                        title = "Clusters Based on Queen Contiguity", subtitle = "", footnote = "", legend_position = "bottom", color_scheme = "C") {
  if (!inherits(dataset, "sf")) {
    stop("The dataset must be an sf object")
  }

  filter_expr <- rlang::parse_expr(paste0("`", lisa_value, "`", comparison, lisa_cutoff))

  filtered_dataset <- dplyr::filter(dataset, !!rlang::sym(lisa_label) == label & !!filter_expr) %>%
    sf::st_as_sf()

  queen_nb <- spdep::poly2nb(filtered_dataset)

  components <- spdep::n.comp.nb(queen_nb)$comp.id

  filtered_dataset <- dplyr::mutate(filtered_dataset, cluster_id = components + id_start)

  filtered_dataset <- dplyr::group_by(filtered_dataset, cluster_id) %>%
    dplyr::mutate(cluster_area = sum(as.numeric(sf::st_area(sf::st_geometry(filtered_dataset))))) %>%
    dplyr::ungroup()

  filtered_dataset$cluster_area <- as.numeric(filtered_dataset$cluster_area)

  cluster_sizes <- table(filtered_dataset$cluster_id)
  valid_clusters <- names(cluster_sizes[cluster_sizes >= min_])
  filtered_dataset <- dplyr::filter(filtered_dataset, cluster_id %in% valid_clusters & cluster_area >= min_area)

  filtered_dataset <- dplyr::mutate(filtered_dataset, cluster_id = as.factor(as.numeric(factor(cluster_id))))

  if (!is.null(level2)) {
    filtered_dataset_simple <- filtered_dataset %>%
      sf::st_drop_geometry() %>%
      dplyr::select(!!rlang::sym(location_var), !!rlang::sym(location_name), !!rlang::sym(level2), cluster_id, cluster_area) %>%
      dplyr::distinct(!!rlang::sym(location_var), !!rlang::sym(level2), .keep_all = TRUE)

    dataset_with_clusters <- merge(dataset, filtered_dataset_simple, by.x = c(location_var, level2), by.y = c(location_var, level2), all.x = TRUE)
  } else {
    filtered_dataset_simple <- filtered_dataset %>%
      sf::st_drop_geometry() %>%
      dplyr::select(!!rlang::sym(location_var), !!rlang::sym(location_name), cluster_id, cluster_area) %>%
      dplyr::distinct(!!rlang::sym(location_var), .keep_all = TRUE)

    dataset_with_clusters <- merge(dataset, filtered_dataset_simple, by.x = location_var, by.y = location_var, all.x = TRUE)
  }

  dataset_with_clusters <- sf::st_as_sf(dataset_with_clusters)

  if (!is.null(level2)) {
    summary_clusters <- dplyr::group_by(filtered_dataset, cluster_id) %>%
      dplyr::summarize(
        regions = paste(unique(paste(!!rlang::sym(location_name), "(", !!rlang::sym(level2), ")", sep = "")), collapse = ", "),
        cluster_area = unique(cluster_area),
        num_ = dplyr::n()
      ) %>%
      dplyr::ungroup()
  } else {
    summary_clusters <- dplyr::group_by(filtered_dataset, cluster_id) %>%
      dplyr::summarize(
        regions = paste(unique(!!rlang::sym(location_name)), collapse = ", "),
        cluster_area = unique(cluster_area),
        num_ = dplyr::n()
      ) %>%
      dplyr::ungroup()
  }

  unique_cluster_ids <- unique(dataset_with_clusters$cluster_id[!is.na(dataset_with_clusters$cluster_id)])
  fill_colors <- c(stats::setNames(viridis::viridis_pal(option = color_scheme)(length(unique_cluster_ids)), unique_cluster_ids), "white")

  plot <- ggplot2::ggplot(dataset_with_clusters) +
    ggplot2::geom_sf(ggplot2::aes(fill = cluster_id), color = NA) +
    ggplot2::scale_fill_manual(values = fill_colors, na.value = "white", name = "Cluster ID", drop = FALSE) +
    ggplot2::labs(title = title, subtitle = subtitle, caption = footnote) +
    ggplot2::theme_minimal() +
    ggplot2::theme(legend.position = legend_position, panel.background = ggplot2::element_rect(fill = "lightgrey"), plot.background = ggplot2::element_rect(fill = "lightgrey"))

  list(dataset_with_clusters = dataset_with_clusters, summary_clusters = summary_clusters, plot = plot)
}

utils::globalVariables(c("mu_beta_m", "n_k", "sigma_hat_sq", "sum_of_squares", "lambda_d"))

#' Calculate Stein's Beta for Each Cluster
#'
#' This function calculates Stein's Beta for each cluster within the dataset.
#' It applies Stein's shrinkage estimator to the specified beta estimates within each cluster.
#'
#' @param data A dataframe containing the data.
#' @param cluster_id The name of the column representing the cluster IDs.
#' @param beta The name of the column representing the beta estimates.
#' @return A data frame containing the input data with additional columns:
#' \item{stein_beta}{The Stein-shrinkage adjusted beta values.}
#' \item{lambda_d}{Shrinkage factors for each cluster.}
#' \item{mu_beta_m}{Mean beta values for each cluster.}
#' \item{sigma_hat_sq}{Estimated variance of the beta values within clusters.}
#' \item{sum_of_squares}{Sum of squared deviations of beta values from their mean.}
#' @examples
#' # Create dummy data
#' library(dplyr)
#' set.seed(123)
#' dummy_data <- data.frame(
#'   years_education = rnorm(100, 12, 3),    # Represents years of education
#'   gender_female = rbinom(100, 1, 0.5),    # 1 = Female, 0 = Male
#'   household_wealth = sample(1:5, 100, replace = TRUE),  # Wealth index from 1 to 5
#'   district_code = sample(1:10, 100, replace = TRUE)     # Represents district codes
#' ) %>% arrange(district_code)
#'
#' # Define a regression formula
#' formula <- years_education ~ gender_female + household_wealth + household_wealth:gender_female
#'
#' # Run the regression for all districts
#' results1 <- DHSr::Replm2(dummy_data, formula, "district_code", "normal")
#'
#' # Assign random clusters for demonstration
#' clusters <- data.frame(
#'   district_code = unique(dummy_data$district_code),
#'   cluster_id = sample(1:3, length(unique(dummy_data$district_code)), replace = TRUE)
#' )
#'
#' # Merge clusters with regression results
#' cluster_beta <- merge(clusters, results1, by.x = "district_code", by.y = "location")
#'
#' # Apply Stein Beta shrinkage
#' results_with_stein_beta <- DHSr::stein_beta(
#'   data = cluster_beta,
#'   cluster_id = "cluster_id",                # Column for cluster IDs
#'   beta = "estimate_gender_female"          # Column for beta estimates
#' )
#'
#' # View results
#' print(head(results_with_stein_beta))
#' @importFrom dplyr filter mutate group_by summarize ungroup sym
#' @export
stein_beta <- function(data, cluster_id, beta) {
  data <- dplyr::filter(data, !is.na(!!rlang::sym(cluster_id)) & !is.na(!!rlang::sym(beta)))

  data <- dplyr::group_by(data, !!rlang::sym(cluster_id)) %>%
    dplyr::mutate(
      n_k = dplyr::n(),
      mu_beta_m = mean(!!rlang::sym(beta)),
      sigma_hat_sq = stats::var(!!rlang::sym(beta)),
      sum_of_squares = sum((!!rlang::sym(beta) - mu_beta_m)^2),
      lambda_d = max(0, 1 - (n_k - 2) * sigma_hat_sq / sum_of_squares),
      stein_beta = lambda_d * mu_beta_m + (1 - lambda_d) * !!rlang::sym(beta)
    ) %>%
    dplyr::ungroup()

  return(data)
}
Any scripts or data that you put into this service are public.
DHSr documentation built on April 4, 2025, 12:18 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
DHSr
Create Large Scale Repeated Regression Summary Statistics Dataset and Visualization Seamlessly

R/hello.R
In DHSr: Create Large Scale Repeated Regression Summary Statistics Dataset and Visualization Seamlessly

Defines functions stein_beta cluster_map Spdeplisa listw Repglmre2 single_glmre2 Replmre2 single_lmre2 Replm2 single_lm2 .onAttach

Documented in cluster_map listw Repglmre2 Replm2 Replmre2 single_glmre2 single_lm2 single_lmre2 Spdeplisa stein_beta

Try the DHSr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DHSr Create Large Scale Repeated Regression Summary Statistics Dataset and Visualization Seamlessly

R/hello.R In DHSr: Create Large Scale Repeated Regression Summary Statistics Dataset and Visualization Seamlessly

Defines functions stein_beta cluster_map Spdeplisa listw Repglmre2 single_glmre2 Replmre2 single_lmre2 Replm2 single_lm2 .onAttach

Documented in cluster_map listw Repglmre2 Replm2 Replmre2 single_glmre2 single_lm2 single_lmre2 Spdeplisa stein_beta

Try the DHSr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DHSr
Create Large Scale Repeated Regression Summary Statistics Dataset and Visualization Seamlessly

R/hello.R
In DHSr: Create Large Scale Repeated Regression Summary Statistics Dataset and Visualization Seamlessly