R/correlations.R
In COINr: Composite Indicator Construction and Analysis

Documented in get_corr get_corr_flags get_cronbach get_denom_corr get_pvals

#' Correlations between indicators and denominators
#'
#' Get a data frame containing any correlations between indicators and denominators that exceed a given
#' threshold. This can be useful when *whether* to denominate an indicator and *by what* may not be obvious.
#' If an indicator is strongly correlated with a denominator, this may suggest to denominate it by that
#' denominator.
#'
#' @param coin A coin class object.
#' @param dset The name of the data set to apply the function to, which should be accessible in `.$Data`.
#' @param cor_thresh A correlation threshold: the absolute value of any correlations between indicator-denominator pairs above this
#' threshold will be flagged.
#' @param cortype The type of correlation: to be passed to the `method` argument of `stats::cor`.
#' @param nround Optional number of decimal places to round correlation values to. Default 2, set `NULL` to
#' disable.
#' @param use_directions Logical: if `TRUE` the extracted data is adjusted using directions found inside the coin (i.e. the "Direction"
#' column input in `iMeta`. See comments on this argument in [get_corr()].
#'
#' @return A data frame of pairwise correlations that exceed the threshold.
#' @export
#'
#' @examples
#' # build example coin
#' coin <- build_example_coin(up_to = "new_coin", quietly = TRUE)
#'
#' # get correlations >0.7 of any indicator with denominators
#' get_denom_corr(coin, dset = "Raw", cor_thresh = 0.7)
#'
get_denom_corr <- function(coin, dset, cor_thresh = 0.6, cortype = "pearson",
                           nround = 2, use_directions = FALSE){

  # indicator data
  # get everything at this point to ensure matching rows
  iData <- get_dset(coin, dset = dset, also_get = "all")

  if(use_directions){
    iData <- directionalise(iData, coin)
  }

  # iMeta
  iMeta <- coin$Meta$Ind

  # only the indicator data
  iData_ <- iData[iMeta$iCode[iMeta$Type == "Indicator"]]

  # only the denoms
  den_codes <- iMeta$iCode[iMeta$Type == "Denominator"]
  if(length(den_codes) == 0){
    stop("No denominators found. Denominators should be labelled as 'Denominator' in iMeta.")
  }
  if(any(den_codes %nin% names(iData))){
    stop("Denominators not found - they are present in iMeta but not found in selected data set.")
  }
  denoms <- iData[den_codes]

  # GET CORRS ---------------------------------------------------------------

  # get correlations
  corr_ind <- stats::cor(iData_, denoms, method = cortype, use = "pairwise.complete.obs")

  # make long
  crtable <- lengthen(corr_ind)


  # FIND HI CORRS -----------------------------------------------------------

  # remove self-correlations
  crtable <- crtable[crtable$V1 != crtable$V2, ]
  # remove NAs
  crtable <- crtable[!is.na(crtable$Value), ]

  # now filter to only high or low correlations
  crtable <- crtable[abs(crtable$Value) > cor_thresh, ]

  # CLEAN AND OUTPUT --------------------------------------------------------

  # col names
  colnames(crtable) <- c("Ind", "Denom", "Corr")

  # round
  if(!is.null(nround)){
    crtable$Corr <- round(crtable$Corr, nround)
  }

  # sort
  crtable <- crtable[order(crtable$Ind),]

  crtable
}


#' Find highly-correlated indicators within groups
#'
#' This returns a data frame of any highly correlated indicators within the same aggregation group. The level of the aggregation
#' grouping can be controlled by the `grouplev` argument.
#'
#' This function is motivated by the idea that having very highly-correlated indicators within the same group may
#' amount to double counting, or possibly redundancy in the framework.
#'
#' This function replaces the now-defunct `hicorrSP()` from COINr < v1.0.
#'
#' @param coin A coin class object
#' @param dset The name of the data set to apply the function to, which should be accessible in `.$Data`.
#' @param cor_thresh A threshold to flag high correlation. Default 0.9.
#' @param grouplev The level to group indicators in. E.g. if `grouplev = 2` it will look for high correlations between indicators that
#' belong to the same group in Level 2.
#' @param cortype The type of correlation, either `"pearson"` (default), `"spearman"` or `"kendall"`. See [stats::cor].
#' @param roundto Number of decimal places to round correlations to. Default 3. Set `NULL` to disable rounding.
#' @param thresh_type Either `"high"`, which will only flag correlations *above* `cor_thresh`, or `"low"`,
#' which will only flag correlations *below* `cor_thresh`.
#' @param use_directions Logical: if `TRUE` the extracted data is adjusted using directions found inside the coin (i.e. the "Direction"
#' column input in `iMeta`. See comments on this argument in [get_corr()].
#'
#' @examples
#' # build example coin
#' coin <- build_example_coin(up_to = "Normalise", quietly = TRUE)
#'
#' # get correlations between indicator over 0.75 within level 2 groups
#' get_corr_flags(coin, dset = "Normalised", cor_thresh = 0.75,
#'                thresh_type = "high", grouplev = 2)
#'
#' @return A data frame with one entry for every indicator pair that is highly correlated within the same group, at the specified level.
#' Pairs are only reported once, i.e. only uses the upper triangle of the correlation matrix.
#'
#' @export
get_corr_flags <- function(coin, dset, cor_thresh = 0.9, thresh_type = "high", cortype = "pearson",
                     grouplev = NULL, roundto = 3, use_directions = FALSE){


  # CHECKS AND DEFAULTS -----------------------------------------------------

  grouplev <- set_default(grouplev, 2)

  stopifnot(thresh_type %in% c("high", "low"))

  if(grouplev > coin$Meta$maxlev){
    stop("grouplev is greater than the maximum level of ", coin$Meta$maxlev)
  }

  stopifnot(cor_thresh <= 1,
            cor_thresh >= -1)

  iData <- get_dset(coin, dset, also_get = "none")

  if(use_directions){
    iData <- directionalise(iData, coin)
  }


  # GET CORRS ---------------------------------------------------------------

  # get correlations
  corr_ind <- stats::cor(iData, method = cortype, use = "pairwise.complete.obs")

  # make long
  crmat_melt <- lengthen(corr_ind)


  # FIND HI CORRS -----------------------------------------------------------

  # get index structure
  lin <- coin$Meta$Lineage
  # select cols corresponding to what is being correlated against what
  lin <- unique(lin[c(1,grouplev)])

  # get parent group of each of V1 and V2
  crtable <- merge(crmat_melt, lin, by.x = "V1", by.y = colnames(lin)[1])
  colnames(crtable)[ncol(crtable)] <- "P1"
  crtable <- merge(crtable, lin, by.x = "V2", by.y = colnames(lin)[1])
  colnames(crtable)[ncol(crtable)] <- "P2"

  # filter to only include entries from the same group
  crtable <- crtable[crtable$P1 == crtable$P2 ,]
  # remove self-correlations
  crtable <- crtable[crtable$V1 != crtable$V2, ]
  # remove NAs
  crtable <- crtable[!is.na(crtable$Value), ]

  # now filter to only high or low correlations
  if(thresh_type == "high"){
    crtable <- crtable[crtable$Value > cor_thresh, ]
  } else {
    crtable <- crtable[crtable$Value < cor_thresh, ]
  }

  # CLEAN AND OUTPUT --------------------------------------------------------

  # col names
  colnames(crtable) <- c("Ind1", "Ind2", "Corr", "Group", "P2")

  if(!is.null(roundto)){
    crtable$Corr <- round(crtable$Corr, roundto)
  }

  df_out <- crtable[c("Group", "Ind1", "Ind2", "Corr")]
  remove_duplicate_corrs(df_out, c("Ind1", "Ind2"))

}


#' Get correlations
#'
#' Helper function for getting correlations between indicators and aggregates. This retrieves subsets of correlation
#' matrices between different aggregation levels, in different formats. By default, it will return a
#' long-form data frame, unless `make_long = FALSE`. By default, any correlations with a p-value less than 0.05 are
#' replaced with `NA`. See `pval` argument to adjust this.
#'
#' This function allows you to obtain correlations between any subset of indicators or aggregates, from
#' any data set present in a coin. Indicator selection is performed using [get_data()]. Two different
#' indicator sets can be correlated against each other by specifying `iCodes` and `Levels` as vectors.
#'
#' The correlation type can be specified by the `cortype` argument, which is passed to [stats::cor()].
#'
#' The `withparent` argument will optionally only return correlations which correspond to the structure
#' of the index. For example, if `Levels = c(1,2)` (i.e. we wish to correlate indicators from Level 1 with
#' aggregates from Level 2), and we set `withparent = TRUE`, only the correlations between each indicator
#' and its parent group will be returned (not correlations between indicators and other aggregates to which
#' it does not belong). This can be useful to check whether correlations of an indicator/aggregate with
#' any of its parent groups exceeds or falls below thresholds.
#'
#' Similarly, the `grouplev` argument can be used to restrict correlations to within groups corresponding
#' to the index structure. Setting e.g. `grouplev = 2` will only return correlations within the groups
#' defined at Level 2.
#'
#' The `grouplev` and `withparent` options are disabled if `make_long = FALSE`.
#'
#' Note that this function can only call correlations within the same data set (i.e. only one data set in `.$Data`).
#'
#' This function replaces the now-defunct `getCorr()` from COINr < v1.0.
#'
#' @param coin A coin class coin object
#' @param dset  The name of the data set to apply the function to, which should be accessible in `.$Data`.
#' @param iCodes An optional list of character vectors where the first entry specifies the indicator/aggregate
#' codes to correlate against the second entry (also a specification of indicator/aggregate codes). If this is specified as a character vector
#' it will coerced to the first entry of a list, i.e. `list(iCodes)`.
#' @param Levels The aggregation levels to take the two groups of indicators from. See [get_data()] for details.
#' Defaults to indicator level.
#' @param ... Further arguments to be passed to [get_data()] (`uCodes` and `use_group`).
#' @param cortype The type of correlation to calculate, either `"pearson"`, `"spearman"`, or `"kendall"`.
#' @param withparent If `TRUE`, and `aglev[1] != aglev[2]`, will only return correlations of each row with its parent. Alternatively, if
#' `withparent = "family"`, will return correlations with parents, grandparents etc, up to the highest level. In both cases the data set
#' must be aggregated for this to work.
#' @param grouplev The aggregation level to group correlations by if `aglev[1] == aglev[2]`. Requires that
#' `make_long = TRUE`.
#' @param pval The significance level for including correlations. Correlations with \eqn{p > pval} will be returned as `NA`.
#' Default 0.05. Set to 0 to disable this.
#' @param make_long Logical: if `TRUE`, returns correlations in long format (default), else if `FALSE`
#' returns in wide format. Note that if wide format is requested, features specified by `grouplev`
#' and `withparent` are not supported.
#' @param use_directions Logical: if `TRUE` the extracted data is adjusted using directions found inside the coin (i.e. the "Direction"
#' column input in `iMeta`: any indicators with negative direction will have their values multiplied by -1 which will reverse the
#' direction of correlation). This should only be set to `TRUE` if the data set has not yet been normalised. For example, this can be
#' useful to set to `TRUE` to analyse correlations in the raw data, but would make no sense to analyse correlations in the normalised
#' data because that already has the direction adjusted! So you would reverse direction twice. In other words, use this at your
#' discretion.
#'
#' @importFrom stats cor
#'
#' @examples
#' # build example coin
#' coin <- build_example_coin(up_to = "new_coin", quietly = TRUE)
#'
#' # get correlations
#' cmat <- get_corr(coin, dset = "Raw", iCodes = list("Environ"),
#'                  Levels = 1, make_long = FALSE)
#'
#' @return A data frame of pairwise correlation values in wide or long format (see `make_long`).
#' Correlations with \eqn{p > pval} will be returned as `NA`.
#'
#' @seealso
#' * [plot_corr()] Plot correlation matrices of indicator subsets
#'
#' @export
get_corr <- function(coin, dset, iCodes = NULL, Levels = NULL, ...,
                     cortype = "pearson", pval = 0.05, withparent = FALSE,
                     grouplev = NULL, make_long = TRUE, use_directions = FALSE){

  # CHECKS ------------------------------------------------------------------

  check_coin_input(coin)

  # DEFAULTS ----------------------------------------------------------------

  # set Levels, repeat iCodes etc if only one input
  if(is.null(Levels)){Levels <- 1}
  if(is.null(iCodes)){iCodes <- list(NULL)}

  if(!is.list(iCodes)){
    stopifnot(is.character(iCodes))
    iCodes <- as.list(iCodes)
  }

  if (length(iCodes) == 1){
    iCodes = rep(iCodes, 2)
  }
  if (length(Levels) == 1){
    Levels = rep(Levels, 2)
  }
  if (Levels[2] > Levels [1]){
    Levels <- rev(Levels)
    iCodes <- rev(iCodes)
  }

  # catch when two different groups in same level: here we can't show groupings
  if (!setequal(iCodes[[1]], iCodes[[2]])){
    if (Levels[1] == Levels[2]) {
      grouplev <- NULL
    }
  }

  # GET FAM (RECURSIVE) -----------------------------------------------------

  if(!is.logical(withparent)){

    stopifnot(is.character(withparent),
              length(withparent)==1)

    if(withparent != "family"){
      stop("withparent not recognised - should be either logical or 'family'.")
    }

    # ignore second iCode and Level in this case
    iCodes[[2]] <- iCodes[[1]]
    Levels[2] <- Levels[1]

    lin <- coin$Meta$Lineage

    if(ncol(lin) <= Levels[1]){
      stop("If withparent = 'family', Levels[1] cannot be the top level.")
    }

    par_levs <- (Levels[1] + 1) : ncol(lin)

    cr_fam <- lapply(par_levs, function(lev){
      cmat <- get_corr(coin, dset = dset, iCodes = iCodes, Levels = c(Levels[1], lev),
               cortype = cortype, pval = pval, withparent = TRUE, grouplev = grouplev,
               make_long = TRUE, ... = ...)
      # rename to level
      cmat[1] <- names(lin)[lev]
      cmat
    })

    cr_fam <- Reduce(rbind, cr_fam)

    return(cr_fam)
  }

  # GET DATA ----------------------------------------------------------------

  # get data sets
  iData1 <- get_data(coin, dset = dset, iCodes = iCodes[[1]],
                     Level = Levels[[1]], ..., also_get = "none")
  iData2 <- get_data(coin, dset = dset, iCodes = iCodes[[2]],
                     Level = Levels[[2]], ..., also_get = "none")


  # Adjust directions -------------------------------------------------------

  if(use_directions){
    iData1 <- directionalise(iData1, coin)
    iData2 <- directionalise(iData2, coin)
  }

  # GET CORRELATIONS --------------------------------------------------------

  # get corr matrix
  crmat <- stats::cor(iData1, iData2,
                      use = "pairwise.complete.obs", method = cortype)

  # set insignificant correlations to zero if requested
  if(pval > 0){
    # p values
    p_ind <- get_pvals(cbind(iData1, iData2), method = cortype)
    # relevant part of the matrix
    p_ind2 <- p_ind[1:ncol(iData1), ((ncol(iData1)+1):ncol(p_ind))]
    # set elements of crmat to zero, where correlations are below significance level
    # when plotted, these will be white, so invisible
    crmat[p_ind2 > pval] <- NA
  }

  crmat_melt <- lengthen(crmat)
  # remove rows with NAs
  #crmat_melt <- crmat_melt[!is.na(crmat_melt$value),]

  #- PARENTS -------------------------------------

  if (withparent & (ncol(crmat)>1) & Levels[1]!=Levels[2]){

    # get index structure
    lin <- coin$Meta$Lineage
    # select cols corresponding to what is being correlated against what
    lin <- unique(lin[Levels])
    # rename corr matrix cols to prepare for join
    colnames(crmat_melt) <- c(colnames(lin), "Correlation")

    # now merge - we are matching correlation rows that agree with the structure of the index
    # essentially, we subset the rows of crmat_melt to only include the ones that agree with lin
    crtable <- merge(lin, crmat_melt, by = colnames(lin))

    # rename cols for plotting
    colnames(crtable)[1:2] <- c("Var1", "Var2")

  } else {

    crtable <- crmat_melt
    colnames(crtable) <- c("Var1", "Var2", "Correlation")

  }

  ##- GROUP ----------------------------------------
  # If correlating against the same level, only show within groups if asked

  if(!is.null(grouplev)){

    if(!make_long){
      warning("Grouping is not supported for make_long = FALSE. Set make_long = TRUE to group.")
    } else {

      if (Levels[1] == Levels[2]){

        # get index structure
        lin <- coin$Meta$Lineage

        if(grouplev <= Levels[1]){
          stop("grouplev must be at least the aggregation level above Levels.")
        }

        if(grouplev > ncol(lin)){
          stop("Grouping level is out of range - should be between 2 and ", ncol(lin), " or zero to turn off.")
        }

        # select cols corresponding to current level, plus the one above
        # remember here we are correlating a level against itself, so Levels[1]==Levels[2]
        lin <- lin[c(Levels[1], grouplev)]
        # get unique groups in level above
        lev2 <- unique(lin[[2]])

        # initialise df for recording entries of corr matrix to keep
        keeprows <- data.frame(Var1 = NA, Var2 = NA)

        # loop over the levels above
        for (lev2ii in lev2){
          # get child codes
          lev1 <- lin[lin[2]==lev2ii, 1]
          lev1 <- unique(unlist(lev1, use.names = FALSE)) # otherwise it is still a tibble, also remove dups
          # get all 2-way combos of these codes
          lev1pairs <- expand.grid(lev1, lev1)
          # add to df
          keeprows <- rbind(keeprows, lev1pairs)
        }
        # remove first row (dummy)
        keeprows <- keeprows[-1,]

        # rename corr matrix cols to prepare for join
        colnames(crmat_melt)[3] <- "Correlation"
        colnames(keeprows) <- colnames(crmat_melt)[1:2]

        # now do inner join - we are matching correlation rows that agree with the structure of the index
        crtable <- merge(keeprows, crmat_melt, by = colnames(keeprows))
        # sometimes this throws duplicates, so remove
        crtable <- unique(crtable)

      }

    }
  }

  colnames(crtable) <- c("Var1", "Var2", "Correlation")

  # widen or not
  if(!make_long){
    crtable <- widen(crtable)
  }

  crtable
}



#' P-values for correlations in a data frame or matrix
#'
#' This is a stripped down version of the "cor.mtest()" function from the "corrplot" package. It uses
#' the [stats::cor.test()] function to calculate pairwise p-values. Unlike the corrplot version, this
#' only calculates p-values, and not confidence intervals. Credit to corrplot for this code, I only
#' replicate it here to avoid depending on their package for a single function.
#'
#' @param X A numeric matrix or data frame
#' @param \dots Additional arguments passed to function [cor.test()], e.g. \code{conf.level = 0.95}.
#'
#' @importFrom stats cor.test
#'
#' @examples
#' # a matrix of random numbers, 3 cols
#' x <- matrix(runif(30), 10, 3)
#'
#' # get correlations between cols
#' cor(x)
#'
#' # get p values of correlations between cols
#' get_pvals(x)
#'
#' @return Matrix of p-values
#' @export
get_pvals = function(X, ...) {

  # convert to matrix, get number cols
  X = as.matrix(X)
  n = ncol(X)

  # prep matrix for p values
  p.X <- matrix(NA, n, n)
  diag(p.X) = 0

  # populate matrix
  for (i in 1:(n - 1)) {
    for (j in (i + 1):n) {

      # get p val for pair
      # catch possibility of all NAs in one or both vectors
      if(all(is.na(X[,i])) | all(is.na(X[,j]))){
        p.X[i, j] <- p.X[j, i] <- NA
      } else {
        tmp = stats::cor.test(x = X[, i], y = X[, j], ...)
        p.X[i, j] = p.X[j, i] = tmp$p.value
      }

    }
  }

  # rename cols
  colnames(p.X) = rownames(p.X) = colnames(X)

  # output
  p.X
}


#' Cronbach's alpha
#'
#' Calculates Cronbach's alpha, a measure of statistical reliability. Cronbach's alpha is a simple measure
#' of "consistency" of a data set, where a high value implies higher reliability/consistency. The
#' selection of indicators via [get_data()] allows to calculate the measure on any group of
#' indicators or aggregates.
#'
#' This function simply returns Cronbach's alpha. If you want a lot more details on reliability, the 'psych' package has
#' a much more detailed analysis.
#'
#' This function replaces the now-defunct `getCronbach()` from COINr < v1.0.
#'
#' @param coin A coin or a data frame containing only numerical columns of data.
#' @param ... Further arguments passed to [get_data()], other than those explicitly specified here.
#' @param use Argument to pass to [stats::cor] to calculate the covariance matrix. Default `"pairwise.complete.obs"`.
#' @param dset The name of the data set to apply the function to, which should be accessible in `.$Data`.
#' @param iCodes Indicator codes to retrieve. If `NULL` (default), returns all iCodes found in
#' the selected data set. See [get_data()].
#' @param Level The level in the hierarchy to extract data from. See [get_data()].
#'
#' @importFrom stats cov
#'
#' @examples
#' # build example coin
#' coin <- build_example_coin(up_to = "new_coin", quietly = TRUE)
#'
#' # Cronbach's alpha for the "P2P" group
#' get_cronbach(coin, dset = "Raw", iCodes = "P2P", Level = 1)
#'
#' @return Cronbach alpha as a numerical value.
#'
#' @export
get_cronbach <- function(coin, dset, iCodes, Level, ..., use = "pairwise.complete.obs"){

  # get data
  iData <- get_data(coin, dset = dset, iCodes = iCodes, Level = Level, ...)

  # get only indicator cols
  iData <- extract_iData(coin, iData, "iData_")

  # get number of variables
  k = ncol(iData)

  # get covariance matrix
  cvtrix <- stats::cov(iData, use = use)

  # sum of all elements of cov matrix
  sigall <- sum(cvtrix, na.rm = TRUE)

  # mean of all elements except diagonal
  sigav <- (sigall - sum(diag(cvtrix), na.rm = TRUE))/(k*(k-1))

  # calculate Cronbach alpha
  (k^2 * sigav)/sigall

}