R/calculate_lhsOpt.R

#' Analyze optimal Latin hypercube sample number
#'
#' @description Population-level analysis of metric raster data to determine the optimal Latin hypercube sample size
#' @family analyze functions
#'
#' @inheritParams calculate_pop
#'
#' @param mats List. Output from \code{\link{calculate_pop}} function.
#' @param KLdiv Logical. Perform Kullback–Leibler divergence testing.
#' @param quant Logical. Perform quantile comparison testing.
#' @param minSamp Numeric. Minimum sample size to test. \code{default = 10}.
#' @param maxSamp Numeric. Maximum sample size to test. \code{default = 100}.
#' @param step Numeric. Sample step size for each iteration. \code{default = 10}.
#' @param rep Numeric. Internal repetitions for each sample size. \code{default = 10}.
#' @param iter Positive Numeric. The number of iterations for the Metropolis-Hastings
#' annealing process. Defaults to \code{10000}. Internal to \code{\link[clhs]{clhs}}.
#'
#' @references
#' Malone BP, Minasny B, Brungard C. 2019. Some methods to improve the utility of conditioned Latin hypercube sampling. PeerJ 7:e6451 DOI 10.7717/peerj.6451
#'
#' @return data.frame with one row per tested sample size and columns \code{n}
#' (sample size), \code{mean_dist} and \code{sd_dist} (mean and standard deviation
#' of the quantile comparison distance), \code{min_S} and \code{max_S} (minimum and
#' maximum PCA similarity factor), and \code{mean_KL} and \code{sd_KL} (mean and
#' standard deviation of the Kullback-Leibler divergence).
#'
#' @examples
#' \dontrun{
#' #--- Load raster and access files ---#
#' r <- system.file("extdata", "mraster.tif", package = "sgsR")
#' mr <- terra::rast(r)
#'
#' #--- calculate lhsPop details ---#
#' mats <- calculate_pop(mraster = mr)
#'
#' calculate_lhsOpt(mats = mats)
#'
#' calculate_lhsOpt(
#'   mats = mats,
#'   PCA = FALSE,
#'   iter = 200
#' )
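#'
#' #--- adjust the tested sample size range (illustrative values) ---#
#' calculate_lhsOpt(
#'   mats = mats,
#'   minSamp = 20,
#'   maxSamp = 200,
#'   step = 20,
#'   rep = 5
#' )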
#' }
#'
#' @note
#' Special thanks to Dr. Brendan Malone for the original implementation of this algorithm.
#'
#' @author Tristan R.H. Goodbody
#'
#' @export


calculate_lhsOpt <- function(mats,
                             PCA = TRUE,
                             quant = TRUE,
                             KLdiv = TRUE,
                             minSamp = 10,
                             maxSamp = 100,
                             step = 10,
                             rep = 10,
                             iter = 10000) {
  #--- Set global vars ---#
  x <- y <- NULL

  #--- Error handling ---#

  if (!is.list(mats)) {
    stop("'mats' must be a list - see output from sgsR::calculate_pop()")
  }

  if (any(!names(mats) %in% c("values", "pcaLoad", "matQ", "matCov"))) {
    stop("'mats' must be the output from the 'sgsR::calculate_pop()' function")
  }

  if (!is.logical(PCA)) {
    stop("'PCA' must be type logical")
  }

  if (!is.logical(quant)) {
    stop("'quant' must be type logical")
  }

  if (!is.logical(KLdiv)) {
    stop("'KLdiv' must be type logical")
  }

  if (!is.numeric(minSamp)) {
    stop("'minSamp' must be type numeric")
  }

  if (!is.numeric(maxSamp)) {
    stop("'maxSamp' must be type numeric")
  }

  if (!is.numeric(step)) {
    stop("'step' must be type numeric")
  }

  nb <- ncol(mats$values)

  #--- Establish sampling sequence ---#

  sampSeq <- seq(minSamp, maxSamp, step)

  matSeq <- matrix(NA, nrow = length(sampSeq), ncol = 6)

  #--- Apply functions for each potential sample size ---#

  for (tSamp in seq_along(sampSeq)) {
    #--- matrix holding iteration outputs ---#

    matFinal <- matrix(NA, nrow = rep, ncol = 7)

    #--- run conditioned Latin hypercube sampling for each repetition ---#

    for (j in 1:rep) {

      ss <- clhs::clhs(mats$values, size = sampSeq[tSamp], progress = TRUE, iter = iter)

      samples <- mats$values[ss, ]

      #--- PCA similarity factor testing ---#

      if (isTRUE(PCA)) {
        if (any(!c("pcaLoad") %in% names(mats))) {
          stop("'mats' does not have PCA loadings. Use `calculate_pop(PCA = TRUE)`.", call. = FALSE)
        }

        #--- perform PCA analysis for the samples to determine variance in each component ---#

        pcaS <- stats::prcomp(samples, scale = TRUE, center = TRUE)

        #--- extract sample pca scores ---#

        pcaSScores <- as.data.frame(pcaS$x)

        #--- extract sample PCA loadings ---#

        pcaLoadSamp <- unname(pcaS$rotation)

        #--- Perform the Krzanowski 1979 calculation ---#
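        #--- Krzanowski's similarity factor compares the planes spanned by the first
        #--- k = 2 principal components of the population and the sample:
        #--- S = trace(t(L) %*% M %*% t(M) %*% L) / k, where L and M hold the first
        #--- two PCA loadings; S ranges from 0 (orthogonal) to 1 (identical) ---#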

        pop <- mats$pcaLoad[, 1:2]
        samp <- pcaLoadSamp[, 1:2]

        #--- transpose matrices ---#

        popT <- t(pop)
        sampT <- t(samp)

        #--- matrix multiplication ---#

        S <- popT %*% samp %*% sampT %*% pop

        matFinal[j, 1] <- sum(diag(S)) / 2
      }

      #--- Comparison of quantiles ---#

      if (isTRUE(quant)) {
        for (var in 1:nb) {
          #--- Calculate sample quantiles ---#

          sampleQuant <- stats::quantile(samples[, var], probs = seq(0, 1, 0.25), names = FALSE, type = 7)

          #--- Calculate population quantiles ---#

          popQuant <- stats::quantile(mats$values[, var], probs = seq(0, 1, 0.25), names = FALSE, type = 7)

          #--- populate quantile differences into matFinal ---#
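          #--- (Euclidean distance between the 0%, 25%, 50%, and 75% quantiles of the
          #--- population and the sample; smaller = closer agreement) ---#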

          matFinal[j, var + 1] <- sqrt((popQuant[1] - sampleQuant[1])^2 +
            (popQuant[2] - sampleQuant[2])^2 +
            (popQuant[3] - sampleQuant[3])^2 +
            (popQuant[4] - sampleQuant[4])^2)
        }

        #--- calculate mean distance from all variables calculated above ---#

        matFinal[j, 6] <- mean(matFinal[j, 2:(nb + 1)])
      }

      if (isTRUE(KLdiv)) {
        #--- calculate sample covariate matrix ---#

        sampleCov <- mat_cov(
          vals = samples,
          nQuant = nrow(mats$matCov),
          nb = nb,
          matQ = mats$matQ
        )

        #--- calculate KL divergence ---#
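        #--- mats$matCov and sampleCov hold, per covariate (column), counts of
        #--- observations in each population quantile bin; entropy::KL.empirical()
        #--- compares these count distributions column by column ---#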

        #--- create empty vector for populating in loop ---#

        kld.vars <- numeric(nb)

        #--- if values are 0 set to a very small number to avoid log(0) ---#

        sampleCov[sampleCov == 0] <- 0.0000001

        #--- Generate KL divergence for each metric and populate kld.vars ---#

        for (kl in 1:nb) {
          kld.vars[kl] <- entropy::KL.empirical(mats$matCov[, kl], sampleCov[, kl])
        }

        #--- calculate mean divergence ---#
        #--- Divergence of 0 means samples do not diverge from pop ---#

        KLout <- mean(kld.vars)

        #--- populate to final matrix ---#

        matFinal[j, 7] <- KLout
      }
    }

    #--- create outputs for all tests ---#

    matSeq[tSamp, 1] <- mean(matFinal[, 6])
    matSeq[tSamp, 2] <- stats::sd(matFinal[, 6])
    matSeq[tSamp, 3] <- min(matFinal[, 1])
    matSeq[tSamp, 4] <- max(matFinal[, 1])
    matSeq[tSamp, 5] <- mean(matFinal[, 7])
    matSeq[tSamp, 6] <- stats::sd(matFinal[, 7])
  }

  dfFinal <- as.data.frame(cbind(sampSeq, matSeq))
  names(dfFinal) <- c("n", "mean_dist", "sd_dist", "min_S", "max_S", "mean_KL", "sd_KL")

  #--- plot the outputs and determine optimal sample size based on mean_KL divergence ---#

  plot_LHCOptim(
    dfFinal,
    maxSamp
  )

  return(dfFinal)
}


plot_LHCOptim <- function(dfFinal,
                          maxSamp) {
  #--- set global vars ---#

  df.x <- NULL

  #--- Rescale mean KL divergence to [0, 1] and invert so that 1 = least divergence ---#
  df <- data.frame(
    x = dfFinal[, 1],
    y = 1 - (dfFinal[, 6] - min(dfFinal[, 6])) / (max(dfFinal[, 6]) - min(dfFinal[, 6]))
  )

  #--- Initial plot of the data ---#
  graphics::plot(df$x, df$y, xlab = "sample number", ylab = "1 - KL Divergence")

  #--- Prepare good initial values for nls() via a linearizing fit ---#
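  #--- The model fitted below is y = alpha * exp(beta * x) + theta. Fixing theta.0
  #--- just above max(y) gives theta.0 - y ~ -alpha * exp(beta * x), so
  #--- log(-y + theta.0) is linear in x and lm() supplies starting alpha and beta ---#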
  theta.0 <- max(df$y) * 1.1
  model.0 <- stats::lm(log(-y + theta.0) ~ x, data = df)
  alpha.0 <- -exp(stats::coef(model.0)[1])
  beta.0 <- stats::coef(model.0)[2]

  start <- list(alpha = alpha.0, beta = beta.0, theta = theta.0)

  #--- Fit the model ---#
  model <- stats::nls(y ~ alpha * exp(beta * x) + theta, data = df, start = start)


  #--- add fitted curve ---#

  predicted <- stats::predict(model, list(x = df$x))

  graphics::plot(df$x, df$y, xlab = "sample number", ylab = "1 - normalized mean KL divergence")
  graphics::lines(df$x, predicted, col = "skyblue", lwd = 3)

  x1 <- c(-1, maxSamp)
  y1 <- c(0.95, 0.95)
  graphics::lines(x1, y1, lwd = 2, col = "red")
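
  #--- the suggested sample size is the tested n whose fitted value lies closest
  #--- to the 0.95 reference line ---#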

  num <- data.frame(df$x, predicted) %>%
    dplyr::filter(abs(predicted - 0.95) == min(abs(predicted - 0.95))) %>%
    dplyr::select(df.x) %>%
    dplyr::pull()

  message(paste0("Your optimum estimated sample size based on KL divergence is: ", num))

  x2 <- c(num, num)
  y2 <- c(0, 1)
  graphics::lines(x2, y2, lwd = 2, col = "red")
}