ejanalysis: Tools for Environmental Justice (EJ) Analysis

Documented in pop.ecdf.dd

#' @title Draw an Ecdf plot showing the distribution of scores in each demographic subgroup
#'
#' @description
#'  Draws a plot using [Hmisc::Ecdf()], overlaying cumulative distribution functions,
#'  one for each subgroup specified, and one for overall population.
#'  Useful to compare subgroups based on the distribution of scores among people in that subgroup
#'  using data from small places like census block groups, based on having for each place the pop total
#'  and % of pop that is in each group.
#'
#' @param e Required numeric vector of scores (e.g., environmental indicator values) for places like block groups, one place per row
#' @param elab Required character single element vector with text label for environmental score, used in xlab of plot
#' @param mydf Required data.frame where each row is a place and each column is the percent of place population that is in given demographic subgroup, one subgroup per column
#' @param dlabs Optional, charater vector that specifies text for legend, same length as number of columns in mydf
#' @param mywt Optional, default is unweighted, but normally would specify as the population counts for places analyzed
#' @param main Optional character specifying plot title at top of plot. Default title notes names of subgroups
#' @param mycolors Optional vector of colors as long as number of columns in mydf, specifies colors of lines
#' @param refcolor Optional color, default is 'black'
#' @param sorted Optional logical, default is TRUE. Should the legend be sorted in order of population weighted mean values of e?
#' @param log Default is ''. Passed to [pop.ecdf()]
#' @param ... Other optional parameters to pass to Ecdf
#' @return Draws a plot
#' @seealso [Hmisc::Ecdf()] [RR()] [pop.cdf()]   [pop.cdf2()] [pop.ecdf()]  [pop.cdf.density()]
#' @examples
#' ## #
#' \dontrun{
#' set.seed(99)
#' pctminsim=c(runif(7000,0,1), pmin(rlnorm(5000, meanlog=log(0.30), sdlog=1.7), 4)/4)
#' popsim= runif(12000, 500, 3000)
#' esim= rlnorm(12000, log(10), log(1.15)) + rnorm(12000, 1, 0.5) * pctminsim - 1
#' }
#' @export
pop.ecdf.dd <- function(e, elab, mydf, dlabs = colnames(mydf), mywt = rep(1, NROW(mydf)), main, mycolors, refcolor = 'black', sorted = TRUE, log = '', ...) {

  if (missing(elab)) {elab <- deparse(substitute(e))}
  if (missing(main)) {
    # mymain <- paste('Scores in overall population vs among\n', paste(dlabs, collapse = ', '), '')
    mymain <- 'Scores within overall population vs \nwithin a given subgroup'
  } else {
    mymain <- main
  }

  mye <- e; myelab <- elab
  refd <- rep(1, NROW(mydf))

  # Append overall ref group to subgroups
  #  cols of mydf incl refd
  #  dlabs incl refd
  # mycolors incl refd because calc after refd in mydf unless user specified
  #  wtdmeans incl refd because calculated after refd in mydf
  mydf <- data.frame(mydf, overall = refd, stringsAsFactors = FALSE)
  dlabs <- c(dlabs, 'overall')
  # wtdmeans <- sapply(mydf, function(x) weighted.mean(mye, x * mywt))
  wtdmeans <- sapply(mydf, function(x) Hmisc::wtd.mean(mye, weights = x * mywt))

  if (missing(mycolors)) {
    n <- NCOL(mydf) - 1 # all but ref group
    if (n <= 12) {
      mycolors <- c(RColorBrewer::brewer.pal(n, "Paired"), refcolor)
    } else {
      mycolors <- c(rainbow(n), refcolor)
    }
  } else {
    # user specified colors prob are missing one for ref group
    if (NCOL(mydf) == length(mycolors) ) {
      warning('using mycolors for all including overall pop so refcolor ignored')
      mycolors <- mycolors
    } else {
      # user seems to have specified right number?
      mycolors <- c(mycolors, refcolor)
    }
  }

  if (sorted) {
    # sort wtdmeans
    # sort columns of mydf
    # sort dlabs
    # sort mycolors?? no need
    ordering <- order(wtdmeans, decreasing = TRUE)
    wtdmeans <- wtdmeans[ordering]
    mydf <- mydf[ , ordering]
    dlabs <- dlabs[ordering]
    mycolors <- mycolors[ordering]
  }

  myxlim <- c(min(mye, na.rm = TRUE), max(mye, na.rm = TRUE))
  if ('x' %in% log) {
    where <- ifelse(log10(wtdmeans[dlabs == 'overall']) > (log10(myxlim[1]) + ((log10(myxlim[2]) - log10(myxlim[1])) / 4)), 'topleft', 'bottomright')
  } else {
    where <- ifelse(wtdmeans[dlabs == 'overall'] > (myxlim[1] + ((myxlim[2] - myxlim[1]) / 4)), 'topleft', 'bottomright')
  }

  for (i in 1:NCOL(mydf)) {

    myd <- mydf[ , i] #; mydlab <- dlabs[i]

    if (i == 1) {
      pop.ecdf(mye, myd, mywt,
               add = FALSE,
               allothers = FALSE,
               xlab = myelab, xlim = myxlim,
               main = mymain,
               col = mycolors[i], log = log, ...)

      legend(x = where, legend = paste(dlabs, ' (mean=', signif(wtdmeans, 4), ', ', round(wtdmeans/wtdmeans[dlabs == 'overall'], 2), 'x overall mean)', sep = ''), col = mycolors, lty = 1, bty = 'n')

    } else {
      pop.ecdf(mye, myd, mywt,
               add = TRUE,
               allothers = FALSE,
               col = mycolors[i], log = log, ...
      )
    }
    abline(v = wtdmeans[i], col = mycolors[i])
  }
}