#' Generate HSAs via Delamater's two step clustering approach
#'
#' @param shp shapefile SpatialPolygonsDataFrame
#' @param from patients home region
#' @param to patients destination
#' @param li_threshold localization index threshold
#' @param min_interventions minimum number of interventions for HSAs to have
#' @param min_clusters minimum number of HSAs in final solution
#' @param max_clusters maximum number of HSAs in final solution
#' @param fill logical; fill in sources without any patients based on neighbours (defaults to TRUE) see details
#' @details Rownames of \code{shp} should be in common with \code{from} and \code{to}. If this is not the case, warnings (if some are missing) or errors will be issued (if no names are in common). Rownames can be set via, e.g., \code{spChFIDs(shp, shp@data$name)}.
#' If \code{fill} is set to \code{TRUE}, observations from neighbouring regions are added to \code{from} to enable the missing region to be allocated to a HSA. The observations are not used in calculating the LI for the heuristic selection of the appropriate number of HSAs though.
#' @return object of class hsa containing the following elements:
#' \describe{
#' \item{\code{call}}{call to \code{gen_hsa}}
#' \item{\code{lookup}}{final lookup table}
#' \item{\code{original_data}}{list containing from and to originally submitted
#'
#' \code{from}: \code{from} as submitted to \code{gen_hsa}
#'
#' \code{to}: \code{to} as submitted to \code{gen_hsa}}
#' \item{\code{reassigned_data}}{list containing from and to as assigned to HSAs
#'
#' \code{from}: \code{from} converted to its HSA
#'
#' \code{to}: \code{to} converted to its HSA}
#' \item{\code{method}}{method used to assign each region to its HSA}
#' \item{\code{original_shp}}{shapefile submitted to \code{gen_hsa} (via \code{shp})}
#' \item{\code{shp}}{shapefile containing HSAs}
#' \item{\code{iterations}}{number of iterations}
#' \item{\code{li}}{localization index of the HSAs}
#' \item{\code{li_threshold}}{\code{li_threshold} option}
#' \item{\code{min_interventions}}{\code{min_interventions} option}
#' \item{\code{n}}{number of HSAs created}
#' \item{\code{n_it}}{number of HSAs in each iteration}
#' \item{\code{n_interventions}}{number of interventions in each HSA after the final iteration}
#' \item{\code{names}}{name of each HSA}
#' \item{\code{flow}}{flow amongst HSAs (as generated by \code{flows})}
#'
#' }
#' @export
#' @examples
gen_hsa_clust <- function(shp, from, to,
li_threshold = 0.4,
min_interventions = 10,
fill = TRUE,
min_clusters = 2,
max_clusters = NULL,
peaks = "dela"){
# unadulterated from and to variables
ofrom <- from
oto <- to
# warnings - check plausibility of regions ----
m <- rownames(shp@data) %in% from
if(FALSE %in% m){
warning(paste(sum(!m), "regions in 'shp' not in 'from'."), immediate. = TRUE, call. = FALSE)
if(sum(!m) == length(m)) stop("No regions in 'shp' in 'from'. Set spChFIDs.")
if(fill){
# get regions without events
empty <- which(!rownames(shp@data) %in% from)
touchmat <- gIntersects(shp, byid = TRUE, returnDense = FALSE)
touchmat[empty]
for(i in 1:length(empty)){
cat(paste("\nFilling", rownames(shp@data)[empty[i]]))
neighbours <- touchmat[[empty[i]]]
neighbours <- rownames(shp@data)[neighbours]
nfrom <- from[from %in% neighbours]
nto <- to[from %in% neighbours]
nfrom <- rep(rownames(shp@data)[empty[i]], length(nfrom))
from <- c(from, nfrom)
to <- c(to, nto)
cat(paste(" with", length(nfrom), "observations"))
if(length(nfrom) == 0) warning("Region has no neighbours with observations", immediate. = TRUE, call. = FALSE)
rm(nfrom, nto, neighbours)
}
cat("\n")
}
}
m <- rownames(shp@data) %in% to
if(FALSE %in% m){
warning(paste(sum(!m), "regions in 'shp' not in 'to'. \n"), immediate. = TRUE, call. = FALSE)
if(sum(!m) == length(m)) stop("No regions in 'shp' in 'to'. Set spChFIDs.")
}
m <- from %in% rownames(shp@data)
if(FALSE %in% m){
warning(paste(sum(!m), "regions in 'from' not in 'shp'. \n"), immediate. = TRUE, call. = FALSE)
if(sum(!m) == length(m)) stop("No regions in 'from' in 'shp'. Set spChFIDs.")
}
m <- to %in% rownames(shp@data)
if(FALSE %in% m){
warning(paste(sum(!m), "regions in 'to' not in 'shp'. \n"), immediate. = TRUE, call. = FALSE)
if(sum(!m) == length(m)) stop("No regions in 'to' in 'shp'. Set spChFIDs.")
}
rm(m)
flow <- flows(from, to)
flowr <- reshape(flow[, c("from", "to", "N")], direction = "wide", v.names = "N", idvar = "from", timevar = "to")
p.sum <- flowr
## Convert raw patient days to proportions (Commitment Index)
# Define variable for last zip code column
n.zip <- ncol(p.sum)
# Sum patient days for each hospital
# Assumes HOSP_ID is first column
hosp.pat <- rowSums(p.sum[,2:n.zip])
# Divide each column by total patient days
p.sum[,2:n.zip] <- p.sum[,2:n.zip] / hosp.pat
# Rename table
p.CI <- p.sum
rm(p.sum)
## Remove hospitals with no patient visits
# Locate hospitals with zero visits
zero.pv <- which(hosp.pat == 0)
# Get names of zero hospitals (will need this later!)
zero.names <- as.character(p.CI$HOSP_ID[zero.pv])
#% Note: p.CI.3yr is now an n x z+1 matrix of CI values.
#% The "+1" includes the identifier column (HOSP_ID).
# Remove hospitals from CI matrix
if(length(zero.pv) > 0) p.CI <- p.CI[-c(zero.pv),]
## Read travel distance data from shapefile
co <- coordinates(shp)
d <- as.data.frame(as.matrix(dist(co, upper = TRUE)))
d <- d[rownames(d) %in% as.character(p.CI[,1]),]
dist.mat <- d
# Convert table to OD matrix
## Scale table to match CI data range (01)
# Get maximum distance between hospitals
max <- max(dist.mat)
# Rescale data
dist.mat <- dist.mat/max
#% Join distance data matrix to CI data matrix
#% Note: To create the final n x m data matrix used for
#% clustering, the n x z and n x n matrix are joined.
# Add column for table join
dist.mat$HOSP_ID <- row.names(dist.mat)
names(p.CI)[1] <- "HOSP_ID"
# Join tables, add distance matrix to CI data
p.CI <- merge(p.CI, dist.mat, by="HOSP_ID")
## Custom 2step Kmeans + Ward's clustering function
#% Note: The inputs for the function are the n x m data
#% matrix (x) and the desired number of clusters (clusters).
kmeans.ward <- function(x, clusters) {
# Create distance matrix
d <- dist(x, "euclidean")
# Perform Ward's clustering
hc <- hclust(d, method="ward.D")
# Get cluster members at "K" clusters
memb <- cutree(hc, k = clusters)
# Make empty holder for cluster center locations
cent <- NULL
# Get cluster centers
for (k in 1:clusters) {
cent <- rbind(cent, colMeans(x[memb == k,]))
}
# Use cluster centers from Ward's to seed Kmeans clustering
k.m <- kmeans(x, cent, iter.max = 10000)
# Return the Kmeans object
return(list(k.m = k.m,
tree = hc))
}
## Create initial cluster solutions for Hospital Groups
## Prepare data and create data holders
#% Note: All possible numbers of clusters are considered
#% from 2 to n1.
# Define the range of cluster solutions to evaluate (the set k)
# if(!is.null(max_clusters)){
# ifelse(max_clusters > length(grep("^N", names(p.CI)))-1,
# cl.max <- length(grep("^N", names(p.CI)))-1,
# cl.max <- max_clusters)
# } else {
cl.max <- length(grep("^N", names(p.CI)))-1
# }
# cl.max <- nrow(p.CI.3yr)-1
clusters <- c(1:cl.max)
# Create an empty holder for cluster statistics
wss <- bss <- r2 <- incF <- SingHosp <- MaxSize <- lowLI <- nohosp <- rep(0, length(clusters))
k.data.pat <- cbind(clusters, wss, bss, r2, incF, SingHosp, MaxSize, lowLI, nohosp)
# Get number of data attributes in table (columns)
col.max <- ncol(p.CI)
# Conduct Kmeans + Ward's for all cluster solutions
for (K in 1:length(clusters)) {
# Use Kmeans + Wards method to create clusters
Kclust <- kmeans.ward(p.CI[,2:col.max], clusters[K])$k.m
# Write cluster statistics to data holder
# Within sum of squares
k.data.pat[K,2] <- Kclust$tot.withinss
# Between sum of squares
k.data.pat[K,3] <- Kclust$betweenss
# R^2
k.data.pat[K,4] <- 1-(Kclust$tot.withinss/Kclust$totss)
# Number of single hosp clusters
table.c <- table(Kclust$cluster)
k.data.pat[K,6] <- sum(table.c == 1)
# Maximum size of any single cluster
k.data.pat[K,7] <- max(table.c)
cat(".")
# LI and number of interventions
nfrom <- Kclust$cluster[match(ofrom, p.CI[,1])]
nto <- Kclust$cluster[match(oto, p.CI[,1])]
nflow <- flows(nfrom, factor(nto, levels = unique(Kclust$cluster)))
nflow <- nflow[nflow$from == nflow$to, ]
s <- sum(nflow$N_to < min_interventions)
li <- sum(nflow$prop_from < li_threshold)
k.data.pat[K,8] <- li
k.data.pat[K,9] <- s
}
# Convert data holder to dataframe
k.data.pat <- as.data.frame(k.data.pat)
## Calculate incremental F scores
n.obs <- nrow(p.CI)
for (i in 2:length(clusters)) {
k.data.pat$incF[i] <- ((k.data.pat$r2[i] - k.data.pat$r2[i-1])/(k.data.pat$clusters[i] - k.data.pat$clusters[i-1])) / ((1-k.data.pat$r2[i])/((n.obs)-(k.data.pat$clusters[i]-1)))
}
## Select the number of Hospital Groups using the heuristic
# Find local maxima in incremental F scores
# Make variable of the last candidate solution to evaluate for maxima
i <- cl.max - 1
# Find the local maxima
if(peaks == "dela") {
incF.peaks <- which(k.data.pat$incF[3:i] > k.data.pat$incF[2:(i-1)] & k.data.pat$incF[3:i] > k.data.pat$incF[4:(i+1)])+2
}
if(peaks == "diff"){
incF.peaks <- which(diff(sign(diff(k.data.pat$incF))) == -2) + 1
}
# Subset initial candidate solutions
candidates <- k.data.pat[incF.peaks,]
# Remove solutions wherein a single cluster has more than 20 hospitals
# candidates <- candidates[candidates$MaxSize < 20,]
# Subset solutions to those with the "minimum" number of single hospital Hospital Groups from remaining solutions
candidates <- candidates[candidates$SingHosp == min(candidates$SingHosp), ]
# Remove solutions with regions without receiving interventions
candidates <- candidates[candidates$nohosp == 0,]
# Remove solutions with regions with lot LI
candidates <- candidates[candidates$lowLI == 0,]
# From the remaining solutions select the solution with most clusters, K
solution <- candidates[candidates$clusters == max(candidates$clusters), ]
# Get number of clusters
n.clusters <- solution$clusters
# Use Kmeans + Wards method to recreate clusters
#% Note: Only the cluster statistics were kept in the
#% initial clustering process. The final cluster
#% solution is recreated to extract Hospital Group
#% membership and cluster center information.
#% Because the clustering algorithm provides
#% deterministic results, this clustering
#% configuration will be identical to the one
#% formed in the intial clustering process.
HG.solution <- kmeans.ward(p.CI[,2:col.max], n.clusters)
lkup <- data.frame(region = p.CI[,1], hsa = HG.solution$k.m$cluster)
nfrom <- lkup$hsa[match(ofrom, lkup$region)]
nto <- lkup$hsa[match(oto, lkup$region)]
# HSA names
# T <- table(oto, nto)
# names <- table(clu1$original_data$to, clu1$reassigned_data$to)
# names <- rownames(T)[apply(T, 2, function(x) which(x == max(x)))]
lkup1 <- data.frame(region = p.CI[,1], hsa = HG.solution$k.m$cluster) #, hsa_name = names[HG.solution$k.m$cluster])
ID <- function(shp)sapply(slot(shp, "polygons"), function(x) slot(x, "ID"))
lkup <- data.frame(region = ID(shp))
lkup$hsa <- lkup1$hsa[match(lkup$region, lkup1$region)]
nfrom <- lkup$hsa[match(ofrom, lkup$region)]
nto <- lkup$hsa[match(oto, lkup$region)]
nflowo <- flows(factor(nfrom, levels = unique(lkup$hsa)), factor(nto, levels = unique(lkup$hsa)))
nflow <- nflowo[nflowo$from == nflowo$to, ]
nshp <- unionSpatialPolygons(shp, lkup$hsa)
u <- unique(lkup)
nshp <- SpatialPolygonsDataFrame(nshp,
data = data.frame(hsa = ID(nshp),
cluster = u$hsa[match(ID(nshp), u$hsa)],
row.names = ID(nshp)))
nshp <- spChFIDs(nshp, nshp@data$hsa)
out <- list(kw = HG.solution,
lookup = lkup,
original_data = list(from = from, to = to),
reassigned_data = list(from = nfrom, to = nto),
li = nflow$prop_from,
li_threshold = li_threshold,
interventions = nflow$N_to,
flow = nflowo,
tree = HG.solution$tree,
scree = k.data.pat$wss,
original_shp = shp,
shp = nshp
)
class(out) <- "hsa.clust"
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.