#' Infer whether there exists more or less within- and between-group
#' local and global inequality than would be expected versus if for all observations
#' the values of all other observations were permuted. This tests if local values
#' are significantly above or below what is expected given the global dataset, and
#' if global values are significantly above or below what is expected given an
#' otherwise random distribution.
#'
#' The output list can be passed to \code{\link[lbmech]{scatterLID}} to plot
#' the group and non-group components of local inequality based on the significance
#' classes.
#'
#' @title Infer if dispersion is significant
#' @param lid The list output from the \code{\link[lbmech]{LID}} function.
#' @param w The same spatial weights matrix used in calculating the \code{lid} input.
#' @param ntrials The number of permutations to perform. Default is 999.
#' @param alpha Threshold for significance. Default is \code{alpha = 0.05}.
#' @param standard The standards matrix with dimensions \code{length(x) x length(x)} used
#' when calculating \code{lid}. Ignored if none had been originally provided, otherwise
#' required.
#' @param expect The expectations matrix with dimensions \code{length(x) x length(x)} used
#' when calculating \code{lid}. Ignored if none had been originally provided, otherwise
#' required.
#' @param multicore Should multithreading be used? Default is \code{FALSE}.
#' @param ncores If \code{multicore = TRUE}, how many cores should be used? Default
#' is \code{ncores = parallel::detectCores() - 1}.
#' @param var.stand Logical. Should the standards be permuted if a matrix was
#' provided? Default is \code{FALSE}.
#' @param var.exp Logical. Should the expectations be permuted if a matrix was
#' provided? Default is \code{FALSE}.
#' @param ng.invert Does a higher non-group value imply higher between group inequality?
#' Default is \code{TRUE}. This is ignored if matrixes were not originally provided, as
#' it is automatically performed.
#' @param max.cross When processing, what is the maximum number of rows that
#' an internal data.table can have? This is generally not a concern unless
#' the number of observations approaches \code{sqrt(.Machine$integer.max)}--usually
#' about 2^31 for most systems. Lower values result in a greater number of chunks
#' thus allowing larger data.sets to be calculated.
#' @param pb Logical. Should a progress bar be displayed? Default is \code{pb = FALSE}, although
#' if a large dataset is processed that requires adjusting \code{max.cross} this can
#' be useful. Ignored if \code{multicore = TRUE}.
#' @param clear.mem Logical. Should \code{\link[base]{gc}} be run in the middle of the
#' calculation? Default is \code{clear.mem} but set as \code{TRUE} if memory limits are a concern.
#' @importFrom data.table fifelse
#' @importFrom data.table data.table
#' @importFrom data.table as.data.table
#' @importFrom data.table copy
#' @return A list with the following entries:
#'
#' (1) \code{$local} A data.table with one column, indicating whether an observation is
#' falls in one of nine categories: Out-group High, Average, or Low for between-group
#' inequality, and In-group High, Average, or Low for within-group inequality based on
#' the significance according to the delta-J statistic in the \code{$stats} data.table.
#'
#' (2) \code{$global} A list with four entries, \code{$J_G} for the group component of the
#' global inequality, \code{$J_NG} for the nongroup, \code{$J} for the total,
#' and $Class, containing the significance class for the global dataset. Each
#' of the first three entries themselves contain three entriesL
#' \code{$delta}, representing the delta-J statistic, \code{$p}, representing its p-value,
#' and $Class, containing the group/non-group class
#'
#' (3) \code{$stats} A data.table containing the number of permutations a randomly-calculated
#' \code{$J_Gi}, \code{$J_NGi}, or \code{$J_i} was above or below the real value
#' @examples
#'
#' # Generate dummy observations
#' x <- runif(10, 1, 100)
#'
#' # Get distance matrix
#' dists <- dist(x)
#'
#' # Get fuzzy weights considering 5 nearest neighbors based on
#' # inverse square distance
#' weights <- makeWeights(dists, bw = 5,
#' mode = 'adaptive', weighting = 'distance',
#' FUN = function(x) 1/x^2, minval = 0.1,
#' row.stand = 'fuzzy')
#'
#' # Obtain the 'local gini' value
#' lid <- LID(x, w = weights, index = 'gini', type = 'local')
#'
#' # Infer whether values are significant relative to the spatial distribution
#' # of the neighbots
#' inference <- inferLID(lid, w = weights, ntrials = 100)
#' @export
inferLID <- function(lid, w, ntrials = 999, alpha = 0.05,
standard = NULL, expect = NULL,
multicore = FALSE, ncores = parallel::detectCores() - 1,
var.stand = FALSE, var.exp = FALSE, ng.invert = TRUE,
max.cross =.Machine$integer.max, pb = TRUE,
clear.mem = FALSE){
# This bit to silence CRAN warnings
Trial=..xrand=id=J_Gi=J_NGi=J_i=n=GroupClass=dGini_MC=dGini=NULL
NonGroupClass=dNonGroup_MC=dNonGroup=.N=p=N=IndexClass=Class=NULL
# Extract the values from the lid list object
x <- copy(as.data.table(lid$local))
x$id <- 1:nrow(x)
agg <- lid$global
index <- attributes(lid$local)$`function`
mle <- attributes(lid$local)$`mle`
# Make sure the user provided the right matrices
if (attributes(lid$local)$`standard` == 'matrix'){
if (is.null(standard)){ stop('Matrix must be provided if original standard was also a matrix')}
} else {
standard <- attributes(lid$local)$`standard`
}
if (attributes(lid$local)$`expectation` == 'matrix'){
if (is.null(expect)) {stop('Matrix must be provided if original expectation was also a matrix')}
} else {
expect <- attributes(lid$local)$`expectation`
}
# Create a table to which we'll add the statistics for each permutation
ptable <- data.table(Trial = rep(1:ntrials,each=nrow(x)))
if (pb) pb1 <- txtProgressBar(min=0,max=ntrials+1,style=3)
if (!multicore){
for (i in 1:ntrials){
if (pb) setTxtProgressBar(pb1,i)
# If you also vary the standard and expectations, do so
if (var.stand){
standard <- sample(standard)
}
if (var.exp){
expect <- sample(expect)
}
# Shuffling your neighbors but keeping the locations as the only
# possible places is effectively shuffling the matrix row-wise while
# keeping the diagonal constant
w_mask <- row(w) != col(w)
w[w_mask] <- stats::ave(w[w_mask], row(w)[w_mask], FUN = sample)
# Perform the LID analysis on the randomized weights
xrand <- data.table(id = x$id,
LID(x$var, w = w, n = x$n, index = index, mle = mle,
standard = standard,
expect = expect,
max.cross = max.cross,
clear.mem = clear.mem)$local)
if (clear.mem) gc()
# Append results to permutation table
ptable[Trial == i, names(xrand) := ..xrand]
if (clear.mem){
rm(xrand)
gc()
}
}
} else {
# Activate the parallel cores
cl <- parallel::makeCluster(ncores)
# Same loop as above, but as a function
perm_function <- function(i) {
if (var.stand) {
standard <- sample(standard)
}
if (var.exp) {
expect <- sample(expect)
}
# Assume w is defined elsewhere or passed as an argument
w_mod <- w
w_mod[w] <- stats::ave(w[w], row(w)[w], FUN = sample)
# Assuming x is defined in your global environment and visible here
xrand <- data.table(id = x$id,
LID(x$var, w = w_mod, n = x$n, index = index, mle = mle,
standard = standard, expect = expect, max.cross = max.cross,
clear.mem = clear.mem)$local)
if (clear.mem) {
gc() # Garbage collection if necessary
}
return(xrand)
}
# Send the variables to the cores
parallel::clusterEvalQ(cl, {
library(data.table)
})
parallel::clusterExport(cl, c("x", "w",
"var.stand", "var.exp",
"standard", "expect",
"index", "mle",
"max.cross", "clear.mem"),
envir=environment())
# Create a table to which we'll add the statistics for each permutation
ptable <- data.table(Trial = rep(1:ntrials, each = nrow(x)))
# Perform parallel computation
results <- parallel::parLapply(cl, 1:ntrials, perm_function)
# Combining results into the ptable
for (i in seq_along(results)) {
ptable[Trial == i, names(results[[i]]) := results[[i]]]
}
# Stop the cluster
parallel::stopCluster(cl)
# Optional: Cleanup if clear.mem is TRUE
if (clear.mem) {
rm(results)
gc()
}
}
# Significance for within-group inequality is defined based on the
# delta-J_Gi statistic, which is simply the group component minus the non-group
# component
group <- merge(ptable[,.(id, dGini_MC = J_Gi - J_NGi)],
x[,.(id, dGini = J_Gi - J_NGi)],
allow.cartesian=TRUE)
# Significance for between-group inequality is defined based on the
# delta-J_NGi statistic, which is simply the non-group component minus the mean
# total inequality
nongroup <- merge(ptable[,.(id, dNonGroup_MC = J_NGi - sum(J_i * n,na.rm=TRUE)/sum(n,na.rm=TRUE)),by='Trial'],
x[,.(id, dNonGroup = J_NGi - sum(J_i*n,na.rm=TRUE)/sum(n,na.rm = TRUE))],
allow.cartesian=TRUE)
# Combine into one table
gini <- cbind(nongroup,group[,-1])
# The within-group class is easy to infer, a higher value always means higher
# inequality
gini[, GroupClass := fifelse(dGini_MC > dGini, 'In-group Low','In-group High')]
# The between group is a bit more difficult, and generally depends on whether
# the self is being included in the non-group, or whether it's being excluded
if ((standard != 'self' & expect != 'self') |
((standard == 'matrix' | expect == 'matrix') & ng.invert)){
gini[, NonGroupClass := fifelse(dNonGroup_MC > dNonGroup, 'Out-group High','Out-group Low')]
} else if ((standard == 'self' | expect == 'self') |
((standard == 'matrix' | expect == 'matrix') & !ng.invert)){
gini[, NonGroupClass := fifelse(dNonGroup_MC > dNonGroup, 'Out-group Low','Out-group High')]
}
# Calculate p values
groupInference <- gini[,.N,by=c('id',"GroupClass")
][,p := 1 - (1+N)/(1+sum(N)),by='id'][]
nongroupInference <- gini[,.N,by=c('id',"NonGroupClass")
][,p := 1 - (1+N)/(1+sum(N)),by='id'][]
# Out-group p values are based on how extreme the permuted global value
# is relative to the actual one
globalp <- ptable[,.(J_Gi = average(J_Gi, w = x$n),
J_NGi = average(J_NGi, w = x$n),
J_i = average(J_i, w = x$n)),by='Trial'
][, GroupClass := fifelse(agg$J_G > J_Gi,'High','Low')
][, NonGroupClass := fifelse(agg$J_NG > J_NGi,'High','Low')
][, IndexClass := fifelse(agg$J < J_i,'High','Low')
]
rm(ptable)
# Calculate p values
groupp <- globalp[, .N, by = c('GroupClass')
][,p := 1 - (1+N)/(1+sum(N))][]
nongroupp <- globalp[, .N, by = c('NonGroupClass')
][,p := 1 - (1+N)/(1+sum(N))][]
indexp <- globalp[, .N, by = c('IndexClass')
][,p := 1 - (1+N)/(1+sum(N))][]
# Add the p values to a list
globalp <- list(J_G = list(delta = agg$J_G - mean(globalp$J_Gi),
p = min(groupp$p)),
J_NG = list(delta = agg$J_NG - mean(globalp$J_NGi),
p = min(nongroupp$p)),
J = list(delta = agg$J - mean(globalp$J_i),
p = min(indexp$p)))
# Calculate significance classes for the complete dataset
# Within-group is straight-forward
if (!is.na(agg$J_G)){
if (globalp$J_G$p > alpha){
globalp$J_G$Class <- 'In-group Average'
} else if (globalp$J_G$delta > 0) {
globalp$J_G$Class <- 'In-group High'
} else {
globalp$J_G$Class <- 'In-group Low'
}
} else {
globalp$J_G$Class <- 'In-group NaN'
}
# Across group is again a bit complicated, and once again depends on whether
# the self is included in the comparisons.
if (!is.na(agg$J_NG)){
if (globalp$J_NG$p > alpha){
globalp$J_NG$Class <- 'Out-group Average'
} else if ((standard != 'self' & expect != 'self') |
((standard == 'matrix' | expect == 'matrix') & ng.invert)){
if (globalp$J_NG$delta > 0) {
globalp$J_NG$Class <- 'Out-group Low'
} else {
globalp$J_NG$Class <- 'Out-group High'
}
} else if ((standard == 'self' | expect == 'self') |
((standard == 'matrix' | expect == 'matrix') & !ng.invert)){
if (globalp$J_NG$delta > 0) {
globalp$J_NG$Class <- 'Out-group High'
} else {
globalp$J_NG$Class <- 'Out-group Low'
}
}
} else {
globalp$J_NG$Class <- 'Out-group NaN'
}
globalp$Class <- paste0(globalp$J_G$Class,", ",globalp$J_NG$Class)
# Add only the significant observations to a new data.table;
# if an observation is not included for a particular group/nongroup group,
# call it 'average'
out <- merge(merge(x,
groupInference[p < alpha, .(id,GroupClass)],
all=TRUE,by='id'),
nongroupInference[p < alpha, .(id,NonGroupClass)],
all=TRUE,by='id')[is.na(GroupClass), GroupClass := 'In-group Average'
][is.na(NonGroupClass), NonGroupClass := 'Out-group Average'
][, Class := paste(GroupClass,NonGroupClass,sep=', ')
][]
# Return to the original order since it tends to get shuffled
out <- out[order(id)]
out$id <- NULL
# Calculate the statistic, again...
gini[,`:=`(delta_G = dGini - dGini_MC,
delta_NG = dNonGroup - dNonGroup_MC)]
# The output table of p values only needs to include the ones above 0.5, since
# there're only two options
stats <- merge(groupInference[order(p),.SD[1],by='id'
][,.(id,Group_C = N, Group_p = p, GroupClass)],
nongroupInference[order(p),.SD[1],by='id'
][,.(id,NonGroup_C = N, NonGroup_p = p, NonGroupClass)])
stats$id <- NULL
out <- list(local = out[,.SD,.SDcols = c('Class')],
global = globalp,
stats = stats)
if (pb) setTxtProgressBar(pb1, i+1)
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.