SAuR: Sensitivity Analysis Using R

Documented in betaKS3_R mollify_R

#' Moving average data smoother.
#'
#' Computation of a moving average data smoother.
#'
#' @param x A numeric vector containing the data to analyze.
#' @param p A length-one numeric vector.
#' @return A \code{matrix} object.
#' @author Sergio Venturini \email{sergio.venturini@unito.it}
#' @seealso \code{\link{betaKS3_R}} for computing the sensitivity measures.
#' @references
#'   Venturini, S., Borgonovo, E. (2020), "Sensitivity Analysis Using 
#'   \code{R}: the \pkg{SAuR} Package", Technical report.
#' @examples
#' \dontrun{
#' data(simdata_sub, package = "SAuR")
#'
#' }
#'
#' @export
mollify_R <- function(x, p) {
  x_mat <- as.matrix(x)
  n <- nrow(x_mat)
  k <- ncol(x_mat)
  
  p <- as.integer(min(c(p, floor(n/2 - 1))))

  if (p > 1) {
    X <- cumsum(as.vector(t(x_mat)))
    X_idx_1 <- numeric(n)
    count <- 1
    for (i in p:(n + p - 1)) {
      X_idx_1[count] <- ifelse(i < n, i + 1, n)
      count <- count + 1
    }
    X_idx_2 <- numeric(n)
    count <- 1
    for (i in (-p):(n - p - 1)) {
      X_idx_2[count] <- ifelse(i > 0, i + 1, 1)
      count <- count + 1
    }
    s <- X[X_idx_1] - X[X_idx_2]
    s_den <- c((2*p - 1):(2*p), rep(2*p, n - 2 - 2), (2*p):(2*p - 1))
    s <- s/s_den
  }

  return(s)
}

#' Kolmogorov-Smirnov Sensitivity Using Smoothing CDF.
#'
#' Computation of the following sensitivity measures: Kolmogorov-Smirnov
#'   distance beta, Borgonovo's delta, Kuiper's discrepancy kappa, Pearson's
#'   correlation ratio eta2 and Wald-Wolfowitz number of runs (normalized),
#'   as well as the corresponding conditional shift/separation measurements.
#'
#' @param data An object of class \code{SAuR_data} containing the model's inputs
#' @param MP A numeric vector indicating the number of partition to use.
#' @param gfx A length-one character vector providing the plot main title
#' @param outcol A length-one numeric vector indicating the output column
#'    to focus on
#' @return A \code{list} object containing the sensitivity measures described above.
#' @author Sergio Venturini \email{sergio.venturini@unito.it}
#' @references
#'   Venturini, S., Borgonovo, E. (2020), "Sensitivity Analysis Using 
#'   \code{R}: the \pkg{SAuR} Package", Technical report.
#' @examples
#' \dontrun{
#' data(simdata_sub, package = "SAuR")
#'
#' }
#'
#' @export
betaKS3_R <- function(data, MP = NULL, gfx = NULL, outcol = NULL) {
  x <- data@X
  y <- data@Y
  if (!is.null(outcol))
    y <- y[, outcol, drop = FALSE]
  n <- nrow(x)
  k <- ncol(x)
  if (is.matrix(y)) {
    if (nrow(y) != n)
      stop("number of observations of y must match those of x.")
  } else {
    if (length(y) != n)
      stop("number of observations of y must match those of x.")
  }
  if (!is.null(dim(y)) & ncol(y) > 1)
    stop("the betaKS3_R() function computes the sensitivity measures for a single output.")
  if (!is.null(dim(y)))
    y <- as.numeric(y)

  check_rstudio <- try(rstudioapi::isAvailable(), silent = TRUE)

  plotdiff <- 1
  gfxrows <- ceiling(sqrt(k))
  nclrs <- 100  # [[TODO]]
  colors <- rainbow(nclrs)

  if (is.null(MP)) {
    MP <- c(32, 3)
  } else {
    MP <- as.matrix(MP)
    MP <- as.numeric(MP[nrow(MP), ])
  }
  if (length(MP) > 1) {
    M <- MP[1]
    P <- MP[length(MP)]
  } else {
    M <- MP[1]
    P <- 3
  }

  if (M == 0) {
    # test debiasing: predict value at partition size M = 0
    res1 <- betaKS3_R(data, MP = 10, outcol = outcol)
    res2 <- betaKS3_R(data, MP = 20, outcol = outcol) # double partition

    res <- list()
    res[["b"]] <- 2*(res1[["b"]] - .5*res2[["b"]])
    res[["d"]] <- 2*(res1[["d"]] - .5*res2[["d"]])
    res[["t"]] <- 2*(res1[["t"]] - .5*res2[["t"]])
    res[["e"]] <- 2*(res1[["e"]] - .5*res2[["e"]])
    res[["w"]] <- res[["bm"]] <- res[["dm"]] <- res[["km"]] <- res[["em"]] <- res[["wm"]] <- NULL

    return(res)
  }

  # cumulative distribution: with ties
  ys <- sort(unique(y))
  indx <- match(ys, y)
  rindx <- match(y, ys)
  
  nn <- length(ys) # get the number of unique sample values
  
  if (nn == n) {
    cdf  <- seq(1, n)/n     # if all samples unique, cdf is easy
  } else {
    rle <- numeric(nn)      # otherwise, build run length encoder
    for (i in rindx) {
      rle[i] <- rle[i] + 1  # count of each value
    }
    cdf <- cumsum(rle)/n
    cdf <- cdf[sort(rindx)] # compute probability at each location
    y_tmp <- sort.int(y, index.return = TRUE)
    ys <- y_tmp$x
    indx <-  y_tmp$ix
  }
  
  xs <- x[indx, ]
  n <- nrow(x)
  k <- ncol(x)

  VY <- var(y)*(length(y) - 1)/length(y) # [[SERGIO: shouldn't we use the sample variance?]]

  m <- seq(from = 0, to = 1, length.out = (M + 1))
  bm <- matrix(0, nrow = k, ncol = M)    # Kolmogorov
  dm <- matrix(0, nrow = k, ncol = M)    # L1 Borgonovo
  tm <- matrix(0, nrow = k, ncol = M)    # Kuiper
  wm <- matrix(0, nrow = k, ncol = M)    # Wald-Wolfowitz
  em <- matrix(VY/n, nrow = k, ncol = M) # conditional expectation
  nm <- matrix(0, nrow = k, ncol = M)
  
  # keep only values from the partition
  indxx <- apply(xs, 2, order)
  xr <- apply(xs, 2, sort)
  
  # test for constant input
  if (any((xr[1, ] == xr[nrow(xr), ]))) {
    stop("constant input factors detected.")
  }
  factorlist <- (1:ncol(xr))[xr[1, ] != xr[nrow(xr), ]]

  for (i in factorlist) {
    xr[indxx[, i], i] <- cumsum(c(1, (diff(xr[, i]) != 0))) # cheap tied ranks
  }
  xr <- xr/max(xr)  # scale to 1

  # prepare for plotting
  if (!is.null(gfx)) {
    if ((class(check_rstudio) != "try-error") & check_rstudio) {
      # do nothing --> the graph is automatically provided in a new plot window
    } else {
      dev.new()
    }
    split.screen(figs = c(gfxrows, ceiling(k/gfxrows)))
    for (i in factorlist) {
      screen(i)
      plot(range(y), c(-.75, .75), type = "n", main = paste0(gfx, " conditioning on x_", i),
        xlab = "y", ylab = ifelse(plotdiff, "Delta CDF", "CDF"),
        cex.lab = .7, cex.main = .7, cex.axis = .7)
    }
    clrs <- hcl(h = seq(120, 0, length = M) + 150)
  }

  for (j in 1:M) {
    print(paste0("step ", j, " of ", M))
    indx <- (m[j] < xr) & (xr <= m[j + 1])
    xtremew <- xor(indx[-1, ], indx[-n, ])
    xtreme <- indx
    cdfc <- apply(indx, 2, cumsum)
    nm[, j] <- cdfc[nrow(cdfc), ] # if no ties: always same number of realizations
    xtreme[1, ] <- xtreme[n, ] <- TRUE
    
    for (i in factorlist) {  # sum of conditionals
      scc <- nm[i, j]
      if (!scc) next
      # difference of conditionals
      smoothdcdf <- mollify_rcpp(cdf - cdfc[, i]/scc, ceiling(P*M))
      xxtrem <- xtreme[, i] | c(xtreme[-1, i], FALSE)
      critdcdf <- smoothdcdf[xxtrem]
      mx <- max(critdcdf)
      mn <- min(critdcdf)
      # Kolmogorov-Smirnov distance
      bm[i, j] <- max(c(0, mx, -mn))
      # Kuiper dicrepancy
      tm[i, j] <- max(c(0, mx - mn))
      #  find max in each positive run, min in each negative
      delta <- sgn <- exval <- 0
      if (!is.null(gfx) & plotdiff) {
        screen(n = i, new = FALSE)
        if (j == 1) erase.screen(n = i)
        plot(ys[xxtrem], critdcdf, col = clrs[j], cex = .3, axes = FALSE, xlab = "", ylab = "",
          xlim = range(y), ylim = c(-.75, .75), pch = 21)
      }

      for (val in critdcdf) {
        sg <- sign(val)
        if (sg == sgn) {
          exval <- max(c(exval, sg*val))
        } else {
          if (exval > 0.5*sqrt(1/n + 1/scc)) {
            delta <- delta + exval
          }
          sgn <- sg
          exval <- sg*val
        }
      }
      
      dm[i, j] <- delta + exval

      # and more indicators:
      #  - Wald-Wolfowitz number of runs
      mju <- 2*scc*(n - scc)/n - 1
      #  - +-1
      wm[i, j] = (mju - sum(xtremew[, i]) + 1)/sqrt(mju*(mju + 1)/(n - 1))
      #  - correlation ratio
      em[i, j] = var(ys[indx[, i]])*(length(ys[indx[, i]]) - 1)/length(ys[indx[, i]]) # [[SERGIO: shouldn't we use the
                                                                                      #   sample variance?]]
    }
  }

  if (!is.null(gfx)) close.screen(all.screens = TRUE)

  b <- rowSums(bm*nm)/n
  d <- rowSums(dm*nm)/n
  t <- rowSums(tm*nm)/n
  w <- rowSums(wm*nm)/n
  f <- 1 - (rowSums(em*nm)/n)/VY
  e <- numeric(k)
  e[factorlist] <- f[factorlist]
  
  res <- list(b = b, d = d, t = t, e = e, w = w, bm = bm, dm = dm, tm = tm, em = em, wm = wm)
  
  return(res)
}