R/LID.R

Defines functions LID

Documented in LID

#' Calculate dispersion indexes according to a given set of standards
#' and expectations (Mejia Ramon and Munson 2023), obtaining group, non-group, 
#' and total values for local observations and the global dataset.
#' 
#' The output list can be passed to \code{\link[lbmech]{inferLID}} to determine
#' whether the values are locally and globally higher or lower than would be 
#' expected if other values were randomly distributed among remaining observations.
#' 
#' @title Local Indicators of Dispersion
#' @param x A vector values
#' @param w A weights matrix of dimensions \code{length(x) x length(x)}
#' representing that a given observation \code{j} (along the columns)
#' is a part of \code{i} (along the rows)'s group. This can be the output of the 
#' \code{\link[lbmech]{makeWeights}} function. 
#' @param index A character string, either 'gini' (the default), or 'inoua', representing
#' whether distances are calculated in L1 or L2 space, respectively. Alternatively,
#' a numeric representing to what value distances and means are raised to when the
#' index is calculated. \code{index = 1} for Gini, and \code{index = 2} for Inoua. 
#' @param expect Either a character string or a matrix with dimensions \code{length(x) x length(x)},
#' representing the expectation value from which errors are calculated for each observation
#' pair between \code{i} and \code{j}. If \code{expect = 'self'}, the expectation is calculated as \code{i};
#' if \code{expect = 'local'}, the expectation is the neighborhood weighted mean; if 
#' if \code{expect = 'global'}, the expectation is the global mean. Expectations that depend on other
#' metrics (including hypothesis-driven that do not depend on the observed dataset) can
#' be provided by using an appropriate matrix. 
#' @param standard Either a character string or a matrix with dimensions \code{length(x) x length(x)},
#' representing the standard by which errors are judged by each observation
#' pair between \code{i} and \code{j}. If \code{standard = 'self'}, the standard is calculated as \code{i}; if 
#' \code{standard = 'other'}, the standard is calculated as \code{j};
#' if \code{standard = 'local'}, the standard is the neighborhood weighted mean; if 
#' if \code{standard = 'global'}, the standard is the global mean. Standards that depend on other
#' metrics (including hypothesis-driven that do not depend on the observed dataset) can
#' be provided by using an appropriate matrix. 
#' @param n A vector representing population weights. How much of an impact does a given 
#' observation have on any other observation regardless of its influence as provided
#' for in \code{w}. Default is \code{1} for all. 
#' @param mle Character string identifying the maximum likelihood estimator to be used. 
#' Default is \code{mle = 'mean'} for the traditional Gini, although since it uses mean absolute error
#' and implies Laplace distribution, \code{mle = 'median'} is recommended. Alternatively,
#' \code{index = 'inoua'} and \code{mle = 'mean'} for Gaussian processes. 
#' @param fun.name If \code{index != c('gini','inoua',1,2)}, how should the function
#' be named? Default is \code{fun.name = paste0(index,'q')}.
#' @param type A character string, either the name or corresponding code of 
#' a particular standard-expectation pair, as defined in #Link to Mejia Ramon and Munson 2023#
#' @param max.cross When processing, what is the maximum number of rows that 
#' an internal data.table can have? This is generally not a concern unless
#' the number of observations approaches \code{sqrt(.Machine$integer.max)}--usually
#' about 2^31 for most systems. Lower values result in a greater number of chunks
#' thus allowing larger data.sets to be calculated.
#' @param canonical Should the canonical Gini or Inoua value also be calculated?
#' Default is \code{FALSE}, and is ignored if \code{index > 2}.
#' @param pb Logical. Should a progress bar be displayed? Default is \code{FALSE}, although
#' if a large dataset is processed that requires adjusting \code{max.cross} this can
#' be useful
#' @param clear.mem Logical. Should \code{\link[base]{gc}} be run in the middle of the 
#' calculation? Default is \code{clear.mem} but set as \code{TRUE} if memory limits are a concern. 
#' @importFrom data.table CJ
#' @importFrom data.table data.table
#' @return A list with the following entries:
#' 
#' (1) \code{$index} A named character string with the code of the index, named with its name
#' 
#' (2) \code{$local} A data.table, with three columns: \code{J_Gi}, the local group dispersion
#' index; \code{J_NGi}, the local non-group dispersion index; and \code{J_i}, the local
#' total dispersion index. Rows are in the same order as the input vector. This data.table
#' also contains the chosen expectations and standards as hidden attributes to be used by
#' \code{\link[lbmech]{inferLID}}.
#'  
#' (3) \code{$global} A list with three entries: \code{$J_G}, the global group dispersion index;
#' \code{$J_NG}, the global nongroup dispersion index; and \code{$J}, the global
#' total dispersion index. 
#' 
#' (4) \code{$canonical} The canonical Gini or Inoua index, if \code{canonical = TRUE} and 
#' \code{index < 3}. 
#' @examples 
#' 
#' # Generate dummy observations
#' x <- runif(10, 1, 100)
#' 
#' # Get distance matrix
#' dists <- dist(x)
#' 
#' # Get fuzzy weights considering 5 nearest neighbors based on 
#' # inverse square distance
#' weights <- makeWeights(dists, bw = 5, 
#'                        mode = 'adaptive', weighting = 'distance',
#'                        FUN = function(x) 1/x^2, minval = 0.1,
#'                        row.stand = 'fuzzy')
#'                        
#' # Obtain the 'local gini' value
#' lid <- LID(x, w = weights, index = 'gini', type = 'local')
#' @export      
LID <- function(x, w, index = 'gini', expect = 'self', standard = 'global',
                n = rep(1, length(x)), mle = 'mean',
                fun.name = paste0(index,'q') ,type = 'spatial', 
                max.cross = .Machine$integer.max, canonical = FALSE,
                pb = FALSE, clear.mem = TRUE){
  # This bit to silence CRAN warnings
  Error=val_j=val_i=n_g=Denom=n_ng=G_Denom=NG_Denom=stand=G_Error=NG_Error=NULL
  J_i=J_Gi=..n=J_NGi=var=..subset_x=..subset_n=..x=..sub_range=NULL
  # In what L-space are we working? If the user declares 'gini' or 'inoua', then
  # override function name and standard transformation entries
  if (index == 'gini' | index == 1){
    fun.name <- 'gini'
    index <- abs
    stand.trans = function(x) x
  } else if ((index == 'inoua' | index == 2)) {
    fun.name <- 'inoua'
    index <- function(x) x^2
    stand.trans = function(x) x^2
  } else {
    stand.trans = function(x) x^index
    index <- function(x) abs(x)^index
    canonical <- FALSE
  }
  fun.code <- toupper(substr(fun.name,1,1))
  
  # Calculate umber of subsets needed based on computational limits
  n_subsets <- ceiling(length(x)/floor(sqrt(max.cross)))
  subset_size <- floor(sqrt(max.cross))
  
  # Global value will be progressively added these values
  G <- 0
  NG <- 0
  
  # What's the name-code-expectation-standard relationship?
  combs <- data.table(Type = c('Spatial', 'Relative', 'Humble', 'Selfish',
                               'Upscaled', 'Local', 'Local-Radical','Local-Critical',
                               'Absolute', 'Downscaled','Global-Radical','Global-Critical'),
                      Code = c('S','R','H','E',
                               'U','L','Y','D',
                               'A','W','X','C'),
                      Expect = c('self','self','self','self',
                                 'local','local','local','local',
                                 'global','global','global','global'),
                      Standard = c('global','local','other','self',
                                   'global','local','other','self',
                                   'global','local','other','self'))
  
  # If a type is declared, override other decisions
  if (toupper(type) %in% toupper(combs$Type)){
    expect <- combs[toupper(type) == toupper(combs$Type)]$Expect
    standard <- combs[toupper(type) == toupper(combs$Type)]$Standard
  } else if (toupper(type) %in% toupper(combs$Code)){
    expect <- combs[toupper(type) == toupper(combs$Code)]$Expect
    standard <- combs[toupper(type) == toupper(combs$Code)]$Standard
  }
  
  # Non-group weights are inverse of group
  nw <- 1 - w
  
  # Population weights are multiplied by probability
  w <- t(n * t(w))
  nw <- t(n * t(nw))
  
  # Local values are appended to this data table 
  gini_list <- data.table()
  
  # Progress bar is optionsl
  if (pb) pb1 <- txtProgressBar(min=1,max=n_subsets+1,style=3)
  
  for (k in seq(n_subsets)){
    # Iterate over every subset, using as large a number of 'i's as we can, calculating
    # their relationship to all 'j's. To get the total value, we just add the 
    # values to a global index and local data.table as we go along. 
    if(pb) setTxtProgressBar(pb1,k)
    
    # Get the needed values for this subset
    sub_range <- ((k-1)*subset_size+1):min(k*subset_size, length(x))
    
    # Value is calculated from a cross-join
    dt <- CJ(J = x, I = x[sub_range], sorted = FALSE)
    dt <- data.table(I=(1:length(sub_range))[row(w[sub_range,])],
                     J=(1:length(x))[col(w[sub_range,])],
                     w=c(w[sub_range,]), 
                     nw = c(nw[sub_range,]),
                     val_i = dt$I, val_j = dt$J)
    
    # Number of neighbors is the sum of group (and nongroup) weights
    dt[, `:=`(n_g = sum(w), n_ng = sum(nw)),by='I']
    
    # See the above table, but this giant chunk is just the linear algebra
    # needed to get the correct combination of expectation and standard for 
    # a particular choice. Additionally, it allows the user to deal with matrix
    # inputs. There's probably a less-verbose way of handling this, but this was
    # the quickest I could code quickly. 
    #
    # We start by overriding user-selected types and codes if non-matrix expectations and
    # standards were provided. It then calculates the error term based on the index
    # function, then denominator based on the chosen standard, and the total value
    # based on the expectation. Generally, the 'by' slot of the data.table is only
    # needed if a 'local' value is needed
    # 
    # If a matrix is provided for one or both of an expectation, then whatever the user
    # put into the 'type' and 'code' fields becomes dominant. 
    if (expect == 'self' & standard == 'global'){
      type <- 'Spatial'
      code <- 'S'
      dt[, Error := index(val_j - val_i)
      ][, `:=`(Denom = 2 * average(stand.trans(val_j), type = mle))
      ][, `:=`(J_Gi = w/n_g * Error/Denom,
               J_NGi = nw/n_ng * Error/Denom)]
    } else if (expect == 'self' & standard == 'local'){
      type <- 'Relative'
      code <- 'R'
      dt[, Error := index(val_j - val_i)
      ][, `:=`(G_Denom = 2 * average(stand.trans(val_j), w = w, type = mle),
               NG_Denom = 2 * average(stand.trans(val_j)), w = nw, type = mle), by = 'I'
      ][, `:=`(J_Gi = w/n_g * Error/G_Denom,
               J_NGi = nw/n_ng * Error/NG_Denom)]
    } else if (expect == 'self' & standard == 'other'){
      type <- 'Humble'
      code <- 'H'
      dt[, Error := index(val_j - val_i)
      ][, `:=`(Denom = 2 * stand.trans(val_j)),
      ][, `:=`(J_Gi = w/n_g * Error/Denom,
               J_NGi = nw/n_ng * Error/Denom)]
    } else if (expect == 'self' & standard == 'self'){
      type <- 'Selfish'
      code <- 'E'
      dt[, Error := index(val_j - val_i)
      ][, `:=`(Denom = 2 * stand.trans(val_i))
      ][, `:=`(J_Gi = w/n_g * Error/Denom,
               J_NGi = nw/n_ng * Error/Denom)]
    } else if (expect == 'self' & methods::is(standard,'matrix')){
      type <- stringr::str_to_sentence(type)
      code <- substr(type,1,1)
      
      standard <- data.table(J=(1:length(x))[row(standard)],
                             I=(1:length(x))[col(standard)],
                             stand=c(standard))
      dt <- merge(dt,standard,by=c('I','J'))
      dt[, Error := index(val_j - val_i)
      ][, `:=`(Denom = 2 * stand.trans(stand))
      ][, `:=`(J_Gi = w/n_g * Error/Denom,
               J_NGi = nw/n_ng * Error/Denom)]
    } else if ((expect == 'local') & (standard == 'global')){
      type <- 'Upscaled'
      code <- 'U'
      dt[, `:=`(G_Error = index(val_j - average(val_j, w = w, type = mle)),
                NG_Error = index(val_j - average(val_j, w = nw, type = mle))), by = 'I'
      ][, `:=`(Denom = 2 * average(stand.trans(val_j), type = mle))
      ][, `:=`(J_Gi = w/n_g * G_Error/Denom,
               J_NGi = nw/n_ng * NG_Error/Denom)]
    } else if (expect == 'local' & standard == 'local'){
      type <- 'Local'
      code <- 'L'
      dt[, `:=`(G_Error = index(val_j - average(val_j, w = w, type = mle)),
                NG_Error = index(val_j - average(val_j, w = nw, type = mle))), by = 'I'
      ][, `:=`(G_Denom = 2 * average(stand.trans(val_j), w = w, type = mle),
               NG_Denom = 2 * average(stand.trans(val_j), w = nw, type = mle)), by = 'I'
      ][, `:=`(J_Gi = w/n_g * G_Error/G_Denom,
               J_NGi = nw/n_ng * NG_Error/NG_Denom)]
    } else if (expect == 'local' & standard == 'other'){
      type <- 'Local-Radical'
      code <- 'Y'
      dt[, `:=`(G_Error = index(val_j - average(val_j, w = w, type = mle)),
                NG_Error = index(val_j - average(val_j, w = nw, type = mle))), by = 'I'
      ][, `:=`(Denom = 2 * stand.trans(val_j)),
      ][, `:=`(J_Gi = w/n_g * G_Error/Denom,
               J_NGi = nw/n_ng * NG_Error/Denom)]
    } else if (expect == 'local' & standard == 'self'){
      type <- 'Local-Critical'
      code <- 'D'
      dt[, `:=`(G_Error = index(val_j - average(val_j, w = w, type = mle)),
                NG_Error = index(val_j - average(val_j, w = nw, type = mle))), by = 'I'
      ][, `:=`(Denom = 2 * stand.trans(val_i)),
      ][, `:=`(J_Gi = w/n_g * G_Error/Denom,
               J_NGi = nw/n_ng * NG_Error/Denom)]
    } else if (expect == 'local' & methods::is(standard,'matrix')){
      type <- stringr::str_to_sentence(type)
      code <- substr(type,1,1)
      standard <- data.table(J=(1:length(x))[row(standard)],
                             I=(1:length(x))[col(standard)],
                             stand=c(standard))
      dt <- merge(dt,standard,by=c('I','J'))
      dt[, `:=`(G_Error = index(val_j - average(val_j, w = w, type = mle)),
                NG_Error = index(val_j - average(val_j, w = nw, type = mle))), by = 'I'
      ][, `:=`(Denom = 2 * stand)
      ][, `:=`(J_Gi = w/n_g * G_Error/Denom,
               J_NGi = nw/n_ng * NG_Error/Denom)]
    } else if (expect == 'global' & standard == 'global'){
      type <- 'Absolute'
      code <- 'A'
      dt[, Error := index(val_j - average(val_j,type = mle))
      ][, `:=`(Denom = 2 * average(stand.trans(val_j), type = mle))
      ][, `:=`(J_Gi = w/n_g * Error/Denom,
               J_NGi = nw/n_ng * Error/Denom)]
    } else if (expect == 'global' & standard == 'local'){
      type <- 'Downscaled'
      code <- 'W'
      dt[, Error := index(val_j - average(val_j, type = mle))
      ][, `:=`(G_Denom = 2 * average(stand.trans(val_j), w = w, type = mle),
               NG_Denom = 2 * average(stand.trans(val_j), w = nw, type = mle)), by = 'I'
      ][, `:=`(J_Gi = w/n_g * Error/G_Denom,
               J_NGi = nw/n_ng * Error/NG_Denom)]
    } else if (expect == 'global' & standard == 'other'){
      type <- 'Global-Radical'
      code <- 'X'
      dt[, Error := index(val_j - average(val_j, type = mle))
      ][, `:=`(Denom = 2 * stand.trans(val_j)),
      ][, `:=`(J_Gi = w/n_g * Error/Denom,
               J_NGi = nw/n_ng * Error/Denom)]
    } else if (expect == 'global' & standard == 'self'){
      type <- 'Global-Critical'
      code <- 'C'
      dt[, Error := index(val_j - average(val_j, type = mle))
      ][, `:=`(Denom = 2 * stand.trans(val_i)),
      ][, `:=`(J_Gi = w/n_g * Error/Denom,
               J_NGi = nw/n_ng * Error/Denom)]
    } else if (expect == 'global' & methods::is(standard,'matrix')){
      type <- stringr::str_to_sentence(type)
      code <- substr(type,1,1)
      standard <- data.table(J=(1:length(x))[row(standard)],
                             I=(1:length(x))[col(standard)],
                             stand=c(standard))
      dt <- merge(dt,standard,by=c('I','J'))
      dt[, Error := index(val_j - average(val_j, type = mle))
      ][, `:=`(Denom = 2 * stand)
      ][, `:=`(J_Gi = w/n_g * Error/Denom,
               J_NGi = nw/n_ng * Error/Denom)]
    } else if (methods::is(expect,'matrix') & standard == 'global'){
      type <- stringr::str_to_sentence(type)
      code <- substr(type,1,1)
      expect <- data.table(J=(1:length(x))[row(expect)],
                           I=(1:length(x))[col(expect)],
                           stand=c(expect))
      dt <- merge(dt,expect,by=c('I','J'))
      dt[, Error := index(val_j - expect)
      ][, `:=`(Denom = 2 * average(stand.trans(val_j), type = mle))
      ][, `:=`(J_Gi = w/n_g * Error/Denom,
               J_NGi = nw/n_ng * Error/Denom)]
    } else if (methods::is(expect,'matrix') & standard == 'local'){
      type <- stringr::str_to_sentence(type)
      code <- substr(type,1,1)
      expect <- data.table(J=(1:length(x))[row(expect)],
                           I=(1:length(x))[col(expect)],
                           stand=c(expect))
      dt <- merge(dt,expect,by=c('I','J'))
      dt[, Error := index(val_j - expect)
      ][, `:=`(G_Denom = 2 * average(stand.trans(val_j), w = w, type = mle),
               NG_Denom = 2 * average(stand.trans(val_j), w = nw, type = mle)), by = 'I'
      ][, `:=`(J_Gi = w/n_g * Error/G_Denom,
               J_NGi = nw/n_ng * Error/NG_Denom)]
    } else if (methods::is(expect,'matrix') & standard == 'other'){
      type <- stringr::str_to_sentence(type)
      code <- substr(type,1,1)
      expect <- data.table(J=(1:length(x))[row(expect)],
                           I=(1:length(x))[col(expect)],
                           stand=c(expect))
      dt <- merge(dt,expect,by=c('I','J'))
      dt[, Error := index(val_j - expect)
      ][, `:=`(Denom = 2 * stand.trans(val_j)),
      ][, `:=`(J_Gi = w/n_g * Error/Denom,
               J_NGi = nw/n_ng * Error/Denom)]
    } else if (methods::is(expect,'matrix') & standard == 'self'){
      type <- stringr::str_to_sentence(type)
      code <- substr(type,1,1)
      expect <- data.table(J=(1:length(x))[row(expect)],
                           I=(1:length(x))[col(expect)],
                           stand=c(expect))
      dt <- merge(dt,expect,by=c('I','J'))
      dt[, Error := index(val_j - expect)
      ][, `:=`(Denom = 2 * stand.trans(val_i)),
      ][, `:=`(J_Gi = w/n_g * Error/Denom,
               J_NGi = nw/n_ng * Error/Denom)]
    } else if (methods::is(expect,'matrix') & methods::is(standard,'matrix')){
      type <- stringr::str_to_sentence(type)
      code <- substr(type,1,1)
      expect <- data.table(J=(1:length(x))[row(expect)],
                           I=(1:length(x))[col(expect)],
                           stand=c(expect))
      dt <- merge(dt,expect,by=c('I','J'))
      standard <- data.table(J=(1:length(x))[row(standard)],
                             I=(1:length(x))[col(standard)],
                             stand=c(standard))
      dt <- merge(dt,standard,by=c('I','J'))
      dt[, Error := index(val_j - expect)
      ][, `:=`(Denom = 2 * stand)
      ][, `:=`(J_Gi = w/n_g * Error/Denom,
               J_NGi = nw/n_ng * Error/Denom)]
    } else {
      stop("Unknown expectation or standard")
    }
    
    # The local values are the weighted judgements cast by each i 
    dt[,J_i := (J_Gi * n_g/sum(..n) + J_NGi * n_ng/sum(..n))]
    gini <- dt[,lapply(.SD,sum),.SDcols = c("J_Gi","J_NGi","J_i"),by='I'
    ][,.SD,.SDcols = c("J_Gi","J_NGi","J_i")]
    if (clear.mem){
      rm(dt)
      gc()
    }
    gini[, var := ..x[..sub_range]][, n := ..n[..sub_range]]
    
    # Append values to the table
    gini_list <- rbind(gini_list, gini)
    if (clear.mem){
      rm(gini)
      gc()
    }
  }
  gini <- gini_list
  rm(gini_list)
  if (clear.mem) gc()
  
  # If a matrix was provided, define the variable that will be added as an 
  # attribute to the output local data.table
  if (methods::is(standard,'matrix')){
    standard <- 'matrix'
  } 
  if (methods::is(expect,'matrix')){
    expect <- 'matrix'
  }
  
  # Index name is the index name plus the type, likewise for the code
  index_name <- paste0(code,fun.code)
  names(index_name) <- paste(type,stringr::str_to_sentence(fun.name))
  
  # Assign the names, expectations, and standards to the output table
  attributes(gini)[c("function","expectation","standard","mle")] <- c(fun.name,expect,standard,mle)
  
  # Output list contains the name, the local table, and the global values
  # (which themselves are just weighted means)
  out <- list(index = index_name,
              local = gini,
              global = list(
                J_G = average(gini$J_Gi, gini$n),
                J_NG = average(gini$J_NGi, gini$n),
                J = average(gini$J_i, gini$n)))
  
  # Calculate the canonical value if appropriate, add to list
  if (canonical){
    out$canonical <- as.numeric(dispersionIndex(x, 
                                                w = n,
                                                index= stringr::str_remove(
                                                  tolower(names(index_name)),
                                                  "[a-z\\-]+ "),
                                                max.cross = max.cross))
  }
  if(pb) setTxtProgressBar(pb1,k+1)
  if (clear.mem) gc()
  return(out)
  
}
andresgmejiar/lbmech documentation built on Feb. 2, 2025, 12:37 a.m.