R/FeatureEffect-ale.R


# For suppressing R CMD check NOTEs about data.table's non-standard evaluation:
.SD <- .ale <- .ale0 <- .ale1 <- .ale2 <- .class <- .count <- .interval <- .interval1 <-
  .interval2 <- .level <- .num <- .y.hat <- .y.hat.cumsum <- .yhat.diff <- NULL


#' Compute ALE for 1 numerical feature
#'
#' @importFrom data.table melt as.data.table setnames
#' @param dat A data.table with the same columns as the training data
#' @param run.prediction Prediction function of the form f(newdata)
#' @param feature.name The column name of the feature for which to compute the ALE
#' @param grid.dt A data.table with a single column holding the grid values for the numerical feature
#' @keywords internal
calculate.ale.num <- function(dat, run.prediction, feature.name, grid.dt) {
  # Matching data instances to intervals
  interval.index <- findInterval(dat[[feature.name]], grid.dt[[1]], left.open = TRUE)
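  # With left.open = TRUE the intervals are (grid[i], grid[i+1]], so e.g.
  # findInterval(c(1.0, 2.5), c(1, 2, 3), left.open = TRUE) returns c(0, 2):
  # a point sitting exactly on the minimum grid value gets index 0.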
  # Data points in the leftmost interval should be in interval 1, not zero
  interval.index[interval.index == 0] <- 1
  # Getting the predictions for instances of upper and lower interval limits
  X.lower <- X.upper <- dat
  X.lower[, feature.name] <- grid.dt[interval.index, ]
  X.upper[, feature.name] <- grid.dt[interval.index + 1, ]
  predictions.lower <- run.prediction(X.lower)
  predictions.upper <- run.prediction(X.upper)
  # finite differences
  prediction.deltas <- predictions.upper - predictions.lower
  deltas <- cbind(X.lower[, feature.name, with = FALSE], prediction.deltas, data.frame(".interval" = interval.index))
  y.hat.names <- setdiff(colnames(deltas), c(colnames(dat), ".interval"))
  deltas <- data.table::melt(deltas,
    variable.name = ".class",
    value.name = ".yhat.diff", measure.vars = y.hat.names
  )
  # average over instances within each interval
  setkeyv(deltas, c(".class", ".interval"))
  deltas <- deltas[, list(.yhat.diff = mean(.yhat.diff)), by = c(".interval", ".class")]
  # accumulate over the intervals
  deltas.accumulated <- deltas[, list(.y.hat.cumsum = cumsum_na(c(0, .yhat.diff))), by = ".class"]
  interval.n.dat <- as.numeric(table(interval.index))
  # the mean effect is the weighted mean of the interval midpoint effects,
  # weighted by the number of points in each interval
  fJ0 <- deltas.accumulated[, list(.ale0 = sum(((.y.hat.cumsum[1:(nrow(.SD) - 1)] +
    .y.hat.cumsum[2:nrow(.SD)]) / 2) * interval.n.dat) / sum(interval.n.dat)),
  by = ".class"
  ]
  # centering the ALEs
  deltas.accumulated <- merge(deltas.accumulated, fJ0, all.x = TRUE, by = ".class")
  fJ <- deltas.accumulated[, list(.value = .y.hat.cumsum - .ale0, .id = 1:nrow(.SD), .type = "ale"),
    by = ".class"
  ]
  grid.dt$.id <- 1:nrow(grid.dt)
  fJ <- merge(fJ, grid.dt, by = ".id")
  fJ$.id <- NULL
  data.frame(fJ)
}
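
# A minimal usage sketch (editor's illustration, not part of the package
# source): calling calculate.ale.num directly with a toy linear model.
# Assumes the iml internals (e.g. cumsum_na) are loaded, for example via
# devtools::load_all(); the prediction wrapper must return a data.frame.
if (FALSE) {
  library(data.table)
  dat <- as.data.table(mtcars)
  model <- lm(mpg ~ wt + hp, data = mtcars)
  run.prediction <- function(newdata) {
    data.frame(.prediction = predict(model, newdata = newdata))
  }
  # 10 intervals between the min and max of wt
  grid.dt <- data.table(wt = quantile(mtcars$wt,
    probs = seq(0, 1, length.out = 11), names = FALSE
  ))
  ale <- calculate.ale.num(dat, run.prediction,
    feature.name = "wt", grid.dt = grid.dt
  )
  head(ale) # one centered, accumulated effect (.value) per grid value of wt
}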


#' Compute ALE for 2 numerical features
#'
#' @param dat A data.table with the same columns as the training data
#' @param run.prediction Prediction function of the form f(newdata)
#' @param feature.name The column names of the two features for which to compute the ALE
#' @param grid.dt1 A data.table with a single column holding the grid values for feature 1
#' @param grid.dt2 A data.table with a single column holding the grid values for feature 2
#' @keywords internal
calculate.ale.num.num <- function(dat, run.prediction, feature.name, grid.dt1, grid.dt2) {
  # Remove data outside the grid boundaries
  dat <- dat[(dat[[feature.name[1]]] <= max(grid.dt1[[1]])) &
             (dat[[feature.name[1]]] >= min(grid.dt1[[1]])) &
             (dat[[feature.name[2]]] <= max(grid.dt2[[1]])) &
             (dat[[feature.name[2]]] >= min(grid.dt2[[1]])),]
  # Matching instances to the grid of feature 1
  interval.index1 <- findInterval(dat[[feature.name[1]]], grid.dt1[[1]], left.open = TRUE)
  # Data points in the leftmost interval should be in interval 1, not zero
  interval.index1[interval.index1 == 0] <- 1
  # Matching instances to the grid of feature 2
  interval.index2 <- findInterval(dat[[feature.name[2]]], grid.dt2[[1]], left.open = TRUE)
  # Data points in the leftmost interval should be in interval 1, not zero
  interval.index2[interval.index2 == 0] <- 1
  # Preparation for predictions of cell corners
  X.low1.low2 <- X.up1.low2 <- X.low1.up2 <- X.up1.up2 <- copy(dat)
  X.low1.low2[, feature.name] <- data.table(grid.dt1[interval.index1, ], grid.dt2[interval.index2, ])
  X.up1.low2[, feature.name] <- data.table(grid.dt1[interval.index1 + 1, ], grid.dt2[interval.index2, ])
  X.low1.up2[, feature.name] <- data.table(grid.dt1[interval.index1, ], grid.dt2[interval.index2 + 1, ])
  X.up1.up2[, feature.name] <- data.table(grid.dt1[interval.index1 + 1, ], grid.dt2[interval.index2 + 1, ])
  # Getting all predictions from the model
  predictions.11 <- run.prediction(X.low1.low2)
  predictions.21 <- run.prediction(X.up1.low2)
  predictions.12 <- run.prediction(X.low1.up2)
  predictions.22 <- run.prediction(X.up1.up2)
  # Second order differences per cell and instance
  # (top right corner - top left corner) - (bottom right corner  - bottom left corner)
  predictions <- (predictions.22 - predictions.21) - (predictions.12 - predictions.11)
  deltas <- cbind(dat[, feature.name, with = FALSE], predictions, data.frame(".interval1" = interval.index1, ".interval2" = interval.index2))
  y.hat.names <- setdiff(colnames(deltas), c(colnames(dat), c(".interval1", ".interval2")))
  # Instead of a matrix, we work with a melted version of the data,
  # which allows us to handle multi-dimensional predictions
  deltas <- data.table::melt(deltas,
    variable.name = ".class",
    value.name = ".y.hat", measure.vars = y.hat.names
  )
  # Make sure all intervals are included
  interval_grid <- expand.grid(
    .interval1 = unique(deltas$.interval1), .interval2 = unique(deltas$.interval2),
    .class = unique(deltas$.class)
  )
  deltas <- merge(deltas, interval_grid, by = c(".interval1", ".interval2", ".class"), all.y = TRUE)
  # Average over all instances within a cell
  setkeyv(deltas, c(".class", ".interval1", ".interval2"))
  deltas <- deltas[, list(.yhat.diff = mean(.y.hat)), by = c(".interval1", ".interval2", ".class")]


  # fill empty cells with the value of the closest neighbouring cell
  # remember which cells were empty for later
  deltas.na.cell <- copy(deltas)
  deltas.na.cell$missing <- is.na(deltas.na.cell$.yhat.diff)
  deltas.na.cell$.yhat.diff <- NULL
  # replace the missing ones with the closest non-missing difference (measured in number of intervals)
  deltas <- rbindlist(future.apply::future_lapply(unique(deltas$.class), function(cl) {
    impute_cells(deltas[.class == cl, ],
      grid1 = grid.dt1, grid2 = grid.dt2,
      x1.ind = ".interval1", x2.ind = ".interval2"
    )
  }))

  # Accumulate the predictions from bottom left to top right
  ale <- deltas[, list(.y.hat.cumsum = cumsum_na(c(0, .yhat.diff)), .interval2 = c(0, .interval2)), by = c(".class", ".interval1")]
  ale <- ale[, list(.y.hat.cumsum = cumsum_na(c(0, .y.hat.cumsum)), .interval1 = c(0, .interval1)), by = c(".class", ".interval2")]
  # Cell counts are needed for weighting later
  cell.counts <- as.matrix(table(interval.index1, interval.index2))
  cell.counts.m <- data.table::melt(as.data.table(cell.counts), measure.vars = "N")
  cell.counts.m$variable <- NULL
  colnames(cell.counts.m) <- c(".interval1", ".interval2", ".count")
  cell.counts.m$.interval1 <- as.numeric(as.character(cell.counts.m$.interval1))
  cell.counts.m$.interval2 <- as.numeric(as.character(cell.counts.m$.interval2))
  ale <- merge(ale, cell.counts.m, by = c(".interval1", ".interval2"), all.x = TRUE)

  # Computing the first-order effect of feature 1
  # First, take the differences across feature 1
  ale1 <- ale[, list(.yhat.diff = .y.hat.cumsum[2:nrow(.SD)] - .y.hat.cumsum[1:(nrow(.SD) - 1)], .interval1 = .interval1[2:(nrow(.SD))], .count = .count[2:(nrow(.SD))]),
    by = c(".class", ".interval2")
  ]
  # Then take the prediction at the midpoint of each interval, which is the mean of the predictions at the endpoints,
  #     and compute the mean, weighted by the number of data instances per cell
  ale1 <- ale1[, list(.ale1 = sum(.count[2:nrow(.SD)] * (.yhat.diff[1:(nrow(.SD) - 1)] +
    .yhat.diff[2:(nrow(.SD))]) / 2) / sum(.count[2:nrow(.SD)])), by = c(".class", ".interval1")]
  ale1 <- ale1[, list(.ale1 = c(0, cumsum_na(.ale1)), .interval1 = c(0, .interval1)), by = c(".class")]
  # Computing the first-order effect of feature 2
  # First, take the differences across feature 2
  ale2 <- ale[, list(.yhat.diff = .y.hat.cumsum[2:nrow(.SD)] - .y.hat.cumsum[1:(nrow(.SD) - 1)], .interval2 = .interval2[2:(nrow(.SD))], .count = .count[2:(nrow(.SD))]),
    by = c(".class", ".interval1")
  ]
  # Then take the prediction at the midpoint of each interval, which is the mean of the predictions at the endpoints,
  #     and compute the mean, weighted by the number of data instances per cell
  ale2 <- ale2[, list(.ale2 = sum(.count[2:nrow(.SD)] * (.yhat.diff[1:(nrow(.SD) - 1)] +
    .yhat.diff[2:(nrow(.SD))]) / 2) / sum(.count[2:nrow(.SD)])), by = c(".class", ".interval2")]
  ale2 <- ale2[, list(.ale2 = c(0, cumsum_na(.ale2)), .interval2 = c(0, .interval2)), by = ".class"]
  # For each cell, compute the average prediction as the mean of the cell corners,
  # then average over all cells, weighted by the number of data points in each cell
  cls <- unique(ale$.class)
  fJ0 <- unlist(lapply(cls, function(cl) {
    ale.cl <- ale[.class == cl, ]
    ale1.cl <- ale1[.class == cl, ]
    ale2.cl <- ale2[.class == cl, ]
    dd <- data.table::dcast(ale.cl, .interval1 ~ .interval2, value.var = ".y.hat.cumsum", drop = FALSE)[, -1]
    dd <- dd - outer(ale1.cl$.ale1, rep(1, nrow(ale2.cl))) - outer(rep(1, nrow(ale1.cl)), ale2.cl$.ale2)
    sum(cell.counts * (dd[1:(nrow(dd) - 1), 1:(ncol(dd) - 1)] + dd[1:(nrow(dd) - 1), 2:ncol(dd)] + dd[2:nrow(dd), 1:(ncol(dd) - 1)] + dd[2:nrow(dd), 2:ncol(dd)]) / 4, na.rm = TRUE) / sum(cell.counts)
  }))
  fJ0 <- data.frame(".fJ0" = fJ0, .class = cls)
  ale <- merge(ale, fJ0, by = c(".class"))
  ale <- merge(ale, ale1, by = c(".class", ".interval1"))
  ale <- merge(ale, ale2, by = c(".class", ".interval2"))
  ale$.ale <- ale$.y.hat.cumsum - ale$.ale1 - ale$.ale2 - ale$.fJ0
  # For later plotting, define the rectangles.
  # These are not the same as the cells, which is a bit counterintuitive:
  # each ALE value lies where the grid lines cross.
  # Instead of coloring the cells, we color a rectangle around each crossing point,
  # and for this we need to compute the rectangles around these points.
  # In image() this happens automatically (see ALEPlot::ALEPlot).
  # For the edges, simply use the grid value as the outer boundary.
  interval.dists <- diff(grid.dt1[c(1, 1:nrow(grid.dt1), nrow(grid.dt1))][[1]])
  interval.dists <- 0.5 * interval.dists
  ale$.right <- grid.dt1[ale$.interval1 + 1, ] + interval.dists[ale$.interval1 + 2]
  ale$.left <- grid.dt1[ale$.interval1 + 1, ] - interval.dists[ale$.interval1 + 1]
  interval.dists2 <- diff(grid.dt2[c(1, 1:nrow(grid.dt2), nrow(grid.dt2))][[1]])
  interval.dists2 <- 0.5 * interval.dists2
  ale$.bottom <- grid.dt2[ale$.interval2 + 1, ] + interval.dists2[ale$.interval2 + 2]
  ale$.top <- grid.dt2[ale$.interval2 + 1, ] - interval.dists2[ale$.interval2 + 1]
  ale[, feature.name[1]] <- grid.dt1[ale$.interval1 + 1, ]
  ale[, feature.name[2]] <- grid.dt2[ale$.interval2 + 1, ]

  ale <- ale[, setdiff(colnames(ale), c(
    ".fJ0", ".ale1", ".ale2", ".y.hat.cumsum", ".count",
    ".interval1", ".interval2"
  )), with = FALSE]
  ale$.type <- "ale"

  data.frame(ale)
}
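
# A usage sketch (editor's illustration) for the two-feature case, reusing
# the toy model and wrapper from the sketch above. Also needs the suggested
# packages future.apply and yaImpute (used via impute_cells).
if (FALSE) {
  grid.dt1 <- data.table(wt = quantile(mtcars$wt,
    probs = seq(0, 1, length.out = 6), names = FALSE
  ))
  grid.dt2 <- data.table(hp = quantile(mtcars$hp,
    probs = seq(0, 1, length.out = 6), names = FALSE
  ))
  ale2d <- calculate.ale.num.num(dat, run.prediction,
    feature.name = c("wt", "hp"), grid.dt1 = grid.dt1, grid.dt2 = grid.dt2
  )
  head(ale2d) # second-order ALE plus .left/.right/.top/.bottom rectangles
}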


#' Compute ALE for 1 categorical feature
#' @param dat A data.table with the same columns as the training data
#' @param run.prediction Prediction function of the form f(newdata)
#' @param feature.name The column name of the feature for which to compute the ALE
#' @keywords internal
calculate.ale.cat <- function(dat, run.prediction, feature.name) {
  x <- dat[, feature.name, with = FALSE][[1]]

  levels.original <- levels(droplevels(x))
  nlev <- nlevels(droplevels(x))
  # if the factor is ordered, use that order
  if (inherits(x, "ordered")) {
    level_order <- 1:nlev
  } else {
    # reorders according to the distance of the levels in the other features
    level_order <- order_levels(dat, feature.name)
  }
  # The new order of the levels
  levels.ordered <- levels.original[level_order]
  # The feature with the levels in the new order
  x.ordered <- order(level_order)[as.numeric(droplevels(x))]
  X.lower <- X.upper <- dat

  # We only want to increase/decrease the ones that are not already the minimum or maximum level
  row.ind.increase <- (1:nrow(dat))[x.ordered < nlev]
  row.ind.decrease <- (1:nrow(dat))[x.ordered > 1]


  # increase / decrease the level where not already at the minimum or maximum
  X.lower[row.ind.decrease, feature.name] <- levels.ordered[x.ordered[row.ind.decrease] - 1]
  X.upper[row.ind.increase, feature.name] <- levels.ordered[x.ordered[row.ind.increase] + 1]
  y.hat <- run.prediction(dat)
  y.hat.increase <- run.prediction(X.upper[row.ind.increase, ])
  y.hat.decrease <- run.prediction(X.lower[row.ind.decrease, ])

  # To measure the difference or 'jump' in prediction between categories, we come from both directions:
  # we measure the difference in prediction when we increase category k to k+1 for instances with category k,
  # and the difference in prediction when we decrease category k+1 to k for instances with category k+1.
  d.increase <- y.hat.increase - y.hat[row.ind.increase, ]
  d.decrease <- y.hat[row.ind.decrease, ] - y.hat.decrease
  # Compute the differences and the ALE
  deltas <- rbind(d.increase, d.decrease)
  deltas <- as.data.table(cbind(deltas, ".level.jump" = c(x.ordered[row.ind.increase], x.ordered[row.ind.decrease] - 1)))
  y.hat.names <- colnames(y.hat)
  deltas <- data.table(data.table::melt(deltas,
    id.vars = c(".level.jump"),
    value.name = ".yhat.diff",
    variable.name = ".class"
  ))
  # All these differences are aggregated (averaged), grouped by the jump between levels (and by .class for multi-output)
  setkeyv(deltas, ".level.jump")
  deltas <- deltas[, list(.yhat.diff = mean(.yhat.diff)), by = c(".class", ".level.jump")]
  # We accumulate the jumps, starting at 0 for the first category
  deltas <- deltas[, list(.ale = c(0, cumsum(.yhat.diff))), by = c(".class")]
  # Results are centered by the mean level effect, weighted by the number of instances per level
  x.count <- as.numeric(table(x))
  x.prob <- x.count / sum(x.count)
  deltas <- deltas[, list(.ale = .ale - sum(.ale * x.prob[level_order]), .level = levels.ordered), by = ".class"]
  colnames(deltas) <- c(".class", ".value", feature.name)
  deltas[, feature.name] <- factor(deltas[, feature.name, with = FALSE][[1]], levels = levels.ordered)
  deltas$.type <- "ale"
  # make sure the rows are ordered by the new level order
  data.frame(deltas[order(deltas[[feature.name]]), ])
}
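
# A usage sketch (editor's illustration) for a categorical feature. Assumes
# the internal helper order_levels() is available (iml internals loaded).
if (FALSE) {
  dat.cat <- as.data.table(mtcars)
  dat.cat$cyl <- factor(dat.cat$cyl)
  model.cat <- lm(mpg ~ cyl + wt, data = dat.cat)
  run.prediction.cat <- function(newdata) {
    data.frame(.prediction = predict(model.cat, newdata = newdata))
  }
  ale.cat <- calculate.ale.cat(dat.cat, run.prediction.cat, feature.name = "cyl")
  ale.cat # one centered effect per (reordered) level of cyl
}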


#' Compute ALE for 2 features, one numerical, one categorical
#'
#' @param dat A data.table with the same columns as the training data
#' @param run.prediction Prediction function of the form f(newdata)
#' @param feature.name The column names of the two features for which to compute the ALE
#' @param grid.dt A data.table with a single column holding the grid values for the numerical feature
#' @keywords internal
calculate.ale.num.cat <- function(dat, run.prediction, feature.name, grid.dt) {

  # Figure out which feature is numerical and which categorical
  x.num.index <- ifelse(inherits(dat[, feature.name, with = FALSE][[1]], "numeric"), 1, 2)
  x.num <- dat[, feature.name[x.num.index], with = FALSE][[1]]
  # We can only compute the ALE within the min and max boundaries of the given intervals.
  # This part is only relevant for user-defined intervals.
  dat <- dat[which((x.num >= min(grid.dt[[1]])) &
                   (x.num <= max(grid.dt[[1]])))]

  x.cat.index <- setdiff(c(1, 2), x.num.index)
  x.cat <- dat[, feature.name[x.cat.index], with = FALSE][[1]]
  levels.original <- levels(x.cat)
  # if the factor is ordered, use that order
  if (inherits(x.cat, "ordered")) {
    level_order <- 1:nlevels(x.cat)
  } else {
    # reorders according to the distance of the levels in the other features
    level_order <- order_levels(dat, feature.name[x.cat.index])
  }
  # The new order of the levels
  levels.ordered <- levels.original[level_order]
  # The feature with the levels in the new order
  x.cat.ordered <- order(level_order)[as.numeric(x.cat)]
  # The rows for which the category can be increased
  row.ind.increase <- (1:nrow(dat))[x.cat.ordered < nlevels(x.cat)]
  row.ind.decrease <- (1:nrow(dat))[x.cat.ordered > 1]
  
  interval.index <- findInterval(dat[[feature.name[x.num.index]]], grid.dt[[1]], left.open = TRUE)
  # Data points in the leftmost interval should be in interval 1, not zero
  interval.index[interval.index == 0] <- 1


  # Since a categorical feature is involved, we do the second-order differences twice
  # Once for the instances that jump from k to k+1 and once for those that jump from category k+1 to k
  X.low1.low2 <- X.up1.low2 <- X.low1.up2 <- X.up1.up2 <- copy(dat)

  # put category as first dimension without loss of generality
  X.low1.low2[row.ind.increase, feature.name[x.num.index]] <-
    grid.dt[interval.index, ][[1]][row.ind.increase]
  X.low1.up2[row.ind.increase, feature.name[x.num.index]] <-
    grid.dt[interval.index + 1, ][[1]][row.ind.increase]
  X.up1.low2[row.ind.increase, feature.name[x.cat.index]] <-
    levels.ordered[x.cat.ordered[row.ind.increase] + 1]
  X.up1.low2[row.ind.increase, feature.name[x.num.index]] <-
    grid.dt[interval.index, ][[1]][row.ind.increase]
  X.up1.up2[row.ind.increase, feature.name[x.cat.index]] <-
    levels.ordered[x.cat.ordered[row.ind.increase] + 1]
  X.up1.up2[row.ind.increase, feature.name[x.num.index]] <-
    grid.dt[interval.index + 1, 1][[1]][row.ind.increase]


  # Getting all predictions from the model
  predictions.11 <- run.prediction(X.low1.low2[row.ind.increase, ])
  predictions.21 <- run.prediction(X.up1.low2[row.ind.increase, ])
  predictions.12 <- run.prediction(X.low1.up2[row.ind.increase, ])
  predictions.22 <- run.prediction(X.up1.up2[row.ind.increase, ])

  # second-order differences for instances jumping from category k to k+1
  d.increase <- (predictions.22 - predictions.21) - (predictions.12 - predictions.11)

  X.low1.low2 <- X.up1.low2 <- X.low1.up2 <- X.up1.up2 <- copy(dat)

  # put category as first dimension without loss of generality
  X.low1.low2[row.ind.decrease, feature.name[x.num.index]] <-
    grid.dt[interval.index, ][[1]][row.ind.decrease]
  X.low1.low2[row.ind.decrease, feature.name[x.cat.index]] <-
    levels.ordered[x.cat.ordered[row.ind.decrease] - 1]
  X.low1.up2[row.ind.decrease, feature.name[x.cat.index]] <-
    levels.ordered[x.cat.ordered[row.ind.decrease] - 1]
  X.low1.up2[row.ind.decrease, feature.name[x.num.index]] <-
    grid.dt[interval.index + 1, ][[1]][row.ind.decrease]
  X.up1.low2[row.ind.decrease, feature.name[x.num.index]] <-
    grid.dt[interval.index, ][[1]][row.ind.decrease]
  X.up1.up2[row.ind.decrease, feature.name[x.num.index]] <-
    grid.dt[interval.index + 1, 1][[1]][row.ind.decrease]


  # Getting all predictions from the model
  predictions.11 <- run.prediction(X.low1.low2[row.ind.decrease, ])
  predictions.21 <- run.prediction(X.up1.low2[row.ind.decrease, ])
  predictions.12 <- run.prediction(X.low1.up2[row.ind.decrease, ])
  predictions.22 <- run.prediction(X.up1.up2[row.ind.decrease, ])

  # second-order differences for instances jumping from category k+1 to k
  d.decrease <- (predictions.22 - predictions.21) - (predictions.12 - predictions.11)


  # Combine the deltas from the k to k+1 and the k+1 to k directions for the same jump
  deltas <- rbind(d.increase, d.decrease)
  deltas <- cbind(deltas,
    .level = c(x.cat.ordered[row.ind.increase], x.cat.ordered[row.ind.decrease] - 1),
    .num = interval.index[c(row.ind.increase, row.ind.decrease)]
  )
  y.hat.names <- colnames(predictions.22)
  deltas <- as.data.table(deltas)
  deltas <- data.table(data.table::melt(deltas, measure.vars = y.hat.names, variable.name = ".class", value.name = ".yhat.diff"))
  # add empty cells
  interval_grid <- expand.grid(
    .level = unique(deltas$.level), .num = unique(deltas$.num),
    .class = unique(deltas$.class)
  )
  deltas <- merge(deltas, interval_grid, by = c(".level", ".num", ".class"), all.y = TRUE)
  # Average results per cell
  setkeyv(deltas, c(".class", ".level", ".num"))
  deltas <- deltas[, list(.yhat.diff = mean(.yhat.diff)), by = c(".class", ".level", ".num")]

  # fill empty cells with the value of the closest neighbouring cell
  # remember which cells were empty for later
  deltas.na.cell <- copy(deltas)
  deltas.na.cell$missing <- is.na(deltas$.yhat.diff)
  deltas.na.cell$.yhat.diff <- NULL
  # replace the missing ones with the closest non-missing difference (measured in number of intervals)
  deltas <- rbindlist(future.apply::future_lapply(unique(deltas$.class), function(cl) {
    impute_cells(deltas[.class == cl, ],
      grid2 = grid.dt,
      x1.ind = ".level", x2.ind = ".num"
    )
  }))
  # Accumulate the predictions from bottom left to top right
  deltas <- deltas[, list(.ale = cumsum(c(0, .yhat.diff)), .num = c(0, .num)), by = c(".class", ".level")]
  deltas <- deltas[, list(.ale = cumsum(c(0, .ale)), .level = c(0, .level)), by = c(".class", ".num")]

  # Cell counts are needed for weighting later
  cell.counts <- table(x.cat.ordered, interval.index)
  cell.counts.m <- as.data.table(cell.counts)
  data.table::setnames(cell.counts.m, "N", "value")
  colnames(cell.counts.m) <- c(".level", ".num", ".count")
  cell.counts.m$.level <- as.numeric(as.character(cell.counts.m$.level)) - 1
  cell.counts.m$.num <- as.numeric(as.character(cell.counts.m$.num))
  deltas <- merge(deltas, cell.counts.m, by = c(".level", ".num"), all.x = TRUE)
  deltas$.count[is.na(deltas$.count)] <- 0

  # Computing the first-order effect of feature 2, the numerical feature.
  # First, take the differences across feature 2
  deltas2 <- deltas[, list(.ale2 = .ale[2:nrow(.SD)] - .ale[1:(nrow(.SD) - 1)], .num = .num[2:(nrow(.SD))], .count = .count[2:(nrow(.SD))]),
    by = c(".class", ".level")
  ]
  # Then take the weighted average over the categories to get the effect at each numerical grid point.
  deltas2 <- deltas2[, list(.ale2 = sum(.count * .ale2) / sum(.count)), by = c(".class", ".num")]
  ale2 <- deltas2[, list(.ale2 = c(0, cumsum(.ale2)), .num = c(0, .num)), by = c(".class")]

  # Computing the first-order effect of feature 1, the categorical feature.
  # take the differences
  deltas1 <- deltas[, list(
    .ale1 = .ale[2:nrow(.SD)] - .ale[1:(nrow(.SD) - 1)], .level = .level[2:(nrow(.SD))],
    .count = (.count[2:nrow(.SD)] + .count[1:(nrow(.SD) - 1)]) / 2
  ), by = c(".class", ".num")]
  # midpoints between numerical intervals
  deltas1 <- deltas1[, list(
    .ale1 = (.ale1[2:nrow(.SD)] + .ale1[1:(nrow(.SD) - 1)]) / 2,
    .count = .count[2:nrow(.SD)],
    .num = .num[2:nrow(.SD)]
  ), by = c(".class", ".level")]
  deltas1 <- deltas1[, list(.ale1 = sum(.ale1 * .count) / sum(.count)), by = c(".class", ".level")]
  ale1 <- deltas1[, list(.ale1 = c(0, cumsum_na(.ale1)), .level = c(0, .level)), by = c(".class")]

  cls <- unique(deltas$.class)
  fJ0 <- unlist(lapply(cls, function(cl) {
    deltas.cl <- deltas[.class == cl, ]
    ale1.cl <- ale1[.class == cl, ]
    ale2.cl <- ale2[.class == cl, ]
    dd <- as.matrix(data.table::dcast(deltas.cl, .level ~ .num, value.var = ".ale", drop = FALSE))[, -1]
    dd <- dd - outer(ale1.cl$.ale1, rep(1, nrow(ale2.cl))) - outer(rep(1, nrow(ale1.cl)), ale2.cl$.ale2)
    sum(cell.counts * (dd[, 1:(ncol(dd) - 1)] + dd[, 2:ncol(dd)]) / 2, na.rm = TRUE) / sum(cell.counts)
  }))

  fJ0 <- data.frame(".fJ0" = fJ0, ".class" = cls)
  deltas <- merge(deltas, fJ0, by = c(".class"))
  deltas <- merge(deltas, ale1, by = c(".class", ".level"))
  deltas <- merge(deltas, ale2, by = c(".class", ".num"))
  # remove first-order ALEs and center globally
  deltas$.ale <- deltas$.ale - deltas$.ale1 - deltas$.ale2 - deltas$.fJ0

  # For later plotting, define the rectangles.
  # These are not the same as the cells, which is a bit counterintuitive:
  # each ALE value lies where the grid lines cross.
  # Instead of coloring the cells, we color a rectangle around each crossing point,
  # and for this we need to compute the rectangles around these points.
  # In image() this happens automatically (see ALEPlot::ALEPlot).
  # For the edges, simply use the grid value as the outer boundary.
  deltas$.right <- 1 + deltas$.level + 0.5
  deltas$.left <- 1 + deltas$.level - 0.5
  interval.dists2 <- diff(grid.dt[c(1, 1:nrow(grid.dt), nrow(grid.dt))][[1]])
  interval.dists2 <- 0.5 * interval.dists2
  deltas$.bottom <- grid.dt[deltas$.num + 1, ] + interval.dists2[deltas$.num + 2]
  deltas$.top <- grid.dt[deltas$.num + 1, ] - interval.dists2[deltas$.num + 1]
  deltas[, feature.name[x.num.index]] <- grid.dt[deltas$.num + 1, ]
  deltas[, feature.name[x.cat.index]] <- factor(levels.ordered[deltas$.level + 1], levels = levels.ordered)

  deltas <- deltas[, setdiff(colnames(deltas), c(
    ".fJ0", ".ale1", ".ale2", ".count",
    ".level", ".num"
  )), with = FALSE]
  deltas$.type <- "ale"
  data.frame(deltas)
}
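
# A usage sketch (editor's illustration) for one numerical and one
# categorical feature, reusing dat.cat and run.prediction.cat from the
# sketch above.
if (FALSE) {
  grid.dt <- data.table(wt = quantile(mtcars$wt,
    probs = seq(0, 1, length.out = 6), names = FALSE
  ))
  ale.mixed <- calculate.ale.num.cat(dat.cat, run.prediction.cat,
    feature.name = c("wt", "cyl"), grid.dt = grid.dt
  )
  head(ale.mixed)
}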



#' Impute missing cells of grid
#'
#' By default, assumes the first column of cell.dat is x1 and the second is x2.
#' Leave grid1 NULL if feature x1 is a factor.
#' The difference variable has to be named .yhat.diff.
#'
#' @param cell.dat data.table with at least 4 columns: .yhat.diff and the two interval indices.
#' Make sure that empty cells are also included and cell.dat is not the sparse representation.
#' @param grid1 data.frame where each row is the actual value for a given interval index for feature 1.
#' If NULL, impute_cells assumes that the feature is categorical (a factor).
#' @param grid2 data.frame where each row is the actual value for a given interval index for feature 2
#' @param x1.ind column number or name of cell.dat for feature 1. If one feature is categorical, it has to be x1.
#' @param x2.ind column number or name of cell.dat for feature 2
#' @keywords internal
impute_cells <- function(cell.dat, grid1 = NULL, grid2, x1.ind = 1, x2.ind = 2) {
  assert_data_table(cell.dat)
  assert_data_table(grid1, null.ok = TRUE)
  assert_data_table(grid2)

  if (!requireNamespace("yaImpute", quietly = TRUE)) {
    stop("Please install the yaImpute package to impute empty ALE cells.")
  }

  # Making sure cell.dat contains all possible cells
  stopifnot(nrow(cell.dat) == length(unique(cell.dat[, x1.ind, with = FALSE][[1]])) * length(unique(cell.dat[, x2.ind, with = FALSE][[1]])))
  d.miss.ind <- is.na(cell.dat$.yhat.diff)

  # We don't have to impute anything when no cells are missing.
  if (!any(d.miss.ind)) {
    return(cell.dat)
  }

  # For normalization, distinguish between a categorical and a numerical feature x1
  if (is.null(grid1)) {
    # For a categorical feature, the range is the number of levels;
    # the level indices are scaled to the range 1/n.categories to 1
    range.x1 <- length(unique(cell.dat[[x1.ind]]))
    x1.normalized <- cell.dat[[x1.ind]] / range.x1
  } else {
    # For a numerical feature, the range is max minus min;
    # normalizing means taking the midpoint of neighbouring grid values and dividing by the range
    range.x1 <- max(grid1[[1]]) - min(grid1[[1]])
    x1.normalized <- (grid1[cell.dat[[x1.ind]], ][[1]] + grid1[cell.dat[[x1.ind]] + 1, ][[1]]) / (2 * range.x1)
  }
  # same for the second numerical feature
  range.x2 <- max(grid2[[1]]) - min(grid2[[1]])
  x2.normalized <- (grid2[cell.dat[[x2.ind]], ][[1]] + grid2[cell.dat[[x2.ind]] + 1, ][[1]]) / (2 * range.x2)
  # preparation for yaImpute::ann function
  z.na <- cbind(x1.normalized[d.miss.ind], x2.normalized[d.miss.ind])
  z.non.na <- cbind(x1.normalized[!d.miss.ind], x2.normalized[!d.miss.ind])
  # match each missing cell to its closest non-missing neighbour
  nbrs <- yaImpute::ann(as.matrix(z.non.na), as.matrix(z.na), k = 1, verbose = FALSE)$knnIndexDist[, 1]
  cell.dat[d.miss.ind, ".yhat.diff"] <- cell.dat[which(!d.miss.ind)[nbrs], .yhat.diff]
  cell.dat
}
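
# An illustration (editor's sketch) of impute_cells on a dense 3x3 cell
# grid with one empty cell; the NA difference is filled with the value of
# its nearest non-missing neighbour. Assumes checkmate (for the asserts)
# and yaImpute are installed.
if (FALSE) {
  grid1 <- data.table(x1 = c(0, 1, 2, 3)) # 4 grid values -> 3 intervals
  grid2 <- data.table(x2 = c(0, 10, 20, 30))
  cell.dat <- as.data.table(expand.grid(.interval1 = 1:3, .interval2 = 1:3))
  set.seed(1)
  cell.dat$.yhat.diff <- rnorm(9)
  cell.dat$.yhat.diff[5] <- NA # empty cell at (.interval1 = 2, .interval2 = 2)
  impute_cells(cell.dat,
    grid1 = grid1, grid2 = grid2,
    x1.ind = ".interval1", x2.ind = ".interval2"
  )
}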
