R/calculate_model_drift.R
In drifter: Concept Drift and Concept Shift Detection for Predictive Models

#' Calculate Model Drift for comparison of models trained on new/old data
#'
#' This function calculates differences between PDP curves calculated for new/old models
#'
#' @param model_old model created on historical / `old`data
#' @param model_new model created on current / `new`data
#' @param data_new data frame with current / `new` data
#' @param predict_function function that takes two arguments: model and new data and returns numeric vector with predictions, by default it's `predict`
#' @param y_new true values of target variable for current / `new` data
#' @param max_obs if negative, them all observations are used for calculation of PDP, is positive, then only `max_obs` are used for calculation of PDP
#' @param scale scale parameter for calculation of scaled drift
#'
#' @return an object of a class `model_drift` (data.frame) with distances calculated based on Partial Dependency Plots
#' @importFrom dplyr filter group_by summarise
#' @importFrom tidyr spread
#' @importFrom stats predict sd
#' @importFrom ingredients partial_dependency
#' @importFrom DALEX explain
#' @export
#'
#' @examples
#'  library("DALEX")
#'  model_old <- lm(m2.price ~ ., data = apartments)
#'  model_new <- lm(m2.price ~ ., data = apartments_test[1:1000,])
#'  calculate_model_drift(model_old, model_new,
#'                   apartments_test[1:1000,],
#'                   apartments_test[1:1000,]$m2.price)
#'
#'  \donttest{
#'  library("ranger")
#'  predict_function <- function(m,x,...) predict(m, x, ...)$predictions
#'  model_old <- ranger(m2.price ~ ., data = apartments)
#'  model_new <- ranger(m2.price ~ ., data = apartments_test)
#'  calculate_model_drift(model_old, model_new,
#'                   apartments_test,
#'                   apartments_test$m2.price,
#'                   predict_function = predict_function)
#'
#'  # here we compare model created on male data
#'  # with model applied to female data
#'  # there is interaction with age, and it is detected here
#'  predict_function <- function(m,x,...) predict(m, x, ..., probability=TRUE)$predictions[,1]
#'  data_old = HR[HR$gender == "male", -1]
#'  data_new = HR[HR$gender == "female", -1]
#'  model_old <- ranger(status ~ ., data = data_old, probability=TRUE)
#'  model_new <- ranger(status ~ ., data = data_new, probability=TRUE)
#'  calculate_model_drift(model_old, model_new,
#'                   HR_test,
#'                   HR_test$status == "fired",
#'                   predict_function = predict_function)
#'
#'  # plot it
#'  library("ingredients")
#'  prof_old <- partial_dependency(model_old,
#'                                      data = data_new[1:500,],
#'                                      label = "model_old",
#'                                      predict_function = predict_function,
#'                                      grid_points = 101,
#'                                      variable_splits = NULL)
#'  prof_new <- partial_dependency(model_new,
#'                                      data = data_new[1:500,],
#'                                      label = "model_new",
#'                                      predict_function = predict_function,
#'                                      grid_points = 101,
#'                                      variable_splits = NULL)
#'  plot(prof_old, prof_new, color = "_label_")
#' }
#'
calculate_model_drift <- function(model_old, model_new,
                                  data_new,
                                  y_new,
                                  predict_function = predict,
                                  max_obs = 100,
                                  scale = sd(y_new, na.rm = TRUE)) {
  #
  # test of model structure
  if (max_obs <= 0) {
    max_obs = nrow(data_new)
  }

  explainer_old <- explain(model_old,
                           data = data_new,
                           label = "model_old",
                           predict_function = predict_function)
  explainer_new <- explain(model_old,
                           data = data_new,
                           label = "model_new",
                           predict_function = predict_function)

  prof_old <- partial_dependency(explainer_old, N = max_obs)
  prof_new <- partial_dependency(explainer_new, N = max_obs)
  # for all variables
  vars <- as.character(unique(prof_old$`_vname_`))

  df <- compare_two_profiles(prof_old, prof_new, vars, scale = scale)

  class(df) <- c("model_drift", "data.frame")
  df
}


#' Calculate Residual Drift for old model and new vs. old data
#'
#' @param model_old model created on historical / `old` data
#' @param data_old data frame with historical / `old` data
#' @param data_new data frame with current / `new` data
#' @param y_old true values of target variable for historical / `old` data
#' @param y_new true values of target variable for current / `new` data
#' @param predict_function function that takes two arguments: model and new data and returns numeric vector with predictions, by default it's `predict`
#' @param bins continuous variables are discretized to `bins` intervals of equal sizes
#'
#' @return an object of a class `covariate_drift` (data.frame) with Non-Intersection Distances calculated for residuals
#' @export
#'
#' @examples
#'  library("DALEX")
#'  model_old <- lm(m2.price ~ ., data = apartments)
#'  model_new <- lm(m2.price ~ ., data = apartments_test[1:1000,])
#'  calculate_model_drift(model_old, model_new,
#'                   apartments_test[1:1000,],
#'                   apartments_test[1:1000,]$m2.price)
#'  \donttest{
#'  library("ranger")
#'  predict_function <- function(m,x,...) predict(m, x, ...)$predictions
#'  model_old <- ranger(m2.price ~ ., data = apartments)
#'  calculate_residuals_drift(model_old,
#'                        apartments_test[1:4000,], apartments_test[4001:8000,],
#'                        apartments_test$m2.price[1:4000], apartments_test$m2.price[4001:8000],
#'                        predict_function = predict_function)
#'  calculate_residuals_drift(model_old,
#'                        apartments, apartments_test,
#'                        apartments$m2.price, apartments_test$m2.price,
#'                        predict_function = predict_function)
#' }
#'
calculate_residuals_drift <- function(model_old,
                                  data_old, data_new,
                                  y_old, y_new,
                                  predict_function = predict,
                                  bins = 20) {
  #
  # distance between residuals
  residuals_old <- y_old - predict_function(model_old, data_old)
  residuals_new <- y_new - predict_function(model_old, data_new)
  residuals_distance <- calculate_distance(residuals_old, residuals_new, bins = bins)

  df <- data.frame(variables = "Residuals",
                   drift = residuals_distance)
  class(df) <- c("covariate_drift", "data.frame")
  df
}


#' Calculates distance between two Ceteris Paribus Profiles
#'
#' This function calculates square root from mean square difference between Ceteris Paribus Profiles
#'
#' @param cpprofile_old Ceteris Paribus Profile for historical / `old` model
#' @param cpprofile_new Ceteris Paribus Profile for current / `new` model
#' @param variables variables for which drift should be calculated
#' @param scale scale parameter for calculation of scaled drift
#'
#' @return data frame with distances between Ceteris Paribus Profiles
#' @export
compare_two_profiles <- function(cpprofile_old, cpprofile_new, variables, scale = 1) {
  # clean check()
  `_label_` <- `_vname_` <- `_yhat_` <- avg <- x <- NULL

  distances <- numeric(length(variables))
  for (i in seq_along(variables)) {
    var <- variables[i]
    selected_var_old <- filter(cpprofile_old, `_vname_` == var)
    selected_var_new <- filter(cpprofile_new, `_vname_` == var)
    selected_var_old <- selected_var_old[,c("_x_", "_yhat_", "_label_")]
    selected_var_new <- selected_var_new[,c("_x_", "_yhat_", "_label_")]
    selected_var <- rbind(selected_var_old, selected_var_new)
    colnames(selected_var)[1] <- "x"

    selected_avg <- summarise(group_by(selected_var, x, `_label_`),
                              avg = mean(`_yhat_`, na.rm = TRUE))
    tmp <- spread(selected_avg, `_label_`, avg)
    distances[i] <- sqrt(mean((tmp$model_new - tmp$model_old)^2))
  }

  df <- data.frame(variables = variables,
                   drift = distances,
                   drift_scaled = distances/scale)
  df
}


#' Print Model Drift Data Frame
#'
#' @param x an object of the class `model_drift`
#' @param max_length length of the first column, by default 25
#' @param ... other arguments, currently ignored
#'
#' @return this function prints a data frame with a nicer format
#' @export
#'
#' @examples
#'  library("DALEX")
#'  model_old <- lm(m2.price ~ ., data = apartments)
#'  model_new <- lm(m2.price ~ ., data = apartments_test[1:1000,])
#'  calculate_model_drift(model_old, model_new,
#'                   apartments_test[1:1000,],
#'                   apartments_test[1:1000,]$m2.price)
#'  \donttest{
#'  library("ranger")
#'  predict_function <- function(m,x,...) predict(m, x, ...)$predictions
#'  model_old <- ranger(m2.price ~ ., data = apartments)
#'  model_new <- ranger(m2.price ~ ., data = apartments_test)
#'  calculate_model_drift(model_old, model_new,
#'                   apartments_test,
#'                   apartments_test$m2.price,
#'                   predict_function = predict_function)
#'
#'  # here we compare model created on male data
#'  # with model applied to female data
#'  # there is interaction with age, and it is detected here
#'  predict_function <- function(m,x,...) predict(m, x, ..., probability=TRUE)$predictions[,1]
#'  data_old = HR[HR$gender == "male", -1]
#'  data_new = HR[HR$gender == "female", -1]
#'  model_old <- ranger(status ~ ., data = data_old, probability=TRUE)
#'  model_new <- ranger(status ~ ., data = data_new, probability=TRUE)
#'  calculate_model_drift(model_old, model_new,
#'                   HR_test,
#'                   HR_test$status == "fired",
#'                   predict_function = predict_function)
#'
#'  # plot it
#'  library("ingredients")
#'  prof_old <- partial_dependency(model_old,
#'                                      data = data_new[1:1000,],
#'                                      label = "model_old",
#'                                      predict_function = predict_function,
#'                                      grid_points = 101,
#'                                      variable_splits = NULL)
#'  prof_new <- partial_dependency(model_new,
#'                                      data = data_new[1:1000,],
#'                                      label = "model_new",
#'                                      predict_function = predict_function,
#'                                      grid_points = 101,
#'                                      variable_splits = NULL)
#'  plot(prof_old, prof_new, color = "_label_")
#'
#' }
#'
print.model_drift <- function(x, max_length = 25, ...) {
  ntmp <- as.character(x$variables)
  numr <- sprintf("%3.2f", x$drift)
  numr2 <- paste0(substr(rep("     ", length(numr)), 1, 6 - nchar(numr)), numr)
  nums <- sprintf("%3.1f", round(100*x$drift_scaled,1))
  nums2 <- paste0(substr(rep("     ", length(nums)), 1, 6 - nchar(nums)), nums)
  nams <- sapply(ntmp, function(j) paste0(substr("                          ", 1,
                                                 pmax(max_length - nchar(j), 0)),
                                          substr(j, 1, max_length),
                                          " "))
  stars <- paste0(ifelse((x$drift_scaled > 0.1) & (x$drift_scaled < 0.2), ".", ""),
                  ifelse(x$drift_scaled > 0.2, "*", ""),
                  ifelse(x$drift_scaled > 0.3, "*", ""),
                  ifelse(x$drift_scaled > 0.4, "*", ""))

  cat("                  Variable    Shift  Scaled\n")
  cat("  -----------------------------------------------\n")
  cat("",paste0(nams, "  ", numr2, "  ", nums2, "  ", stars,"\n"))
  return(invisible(x))
}
Any scripts or data that you put into this service are public.
drifter documentation built on May 31, 2019, 5:04 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
drifter
Concept Drift and Concept Shift Detection for Predictive Models

R/calculate_model_drift.R
In drifter: Concept Drift and Concept Shift Detection for Predictive Models

Try the drifter package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

drifter Concept Drift and Concept Shift Detection for Predictive Models

R/calculate_model_drift.R In drifter: Concept Drift and Concept Shift Detection for Predictive Models

Try the drifter package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

drifter
Concept Drift and Concept Shift Detection for Predictive Models

R/calculate_model_drift.R
In drifter: Concept Drift and Concept Shift Detection for Predictive Models