#' Function performs measures of influence to the submitted OLS model
#'
#' Function offers three different measures of influence including
#' internal/external studentized residuals, and Cook's distance.
#' Both a data frame and a plot of each observations influence on the OLS estimate are
#' returned to help identify data points as possible outliers and/or high leverage.
#'
#' @param df A data frame with columns for observed response and predictors
#' @param formula_obj A formula object following the rules of \code{stats::lm()} construction.
#' For example: \code{y ~ log(a) + b + I(b^2)}.
#' @param id_col An optional argument that names the column from data frame 'df' providing
#' each observation with a unique identification value. If this argument is \code{NULL} then
#' data frame row numbers are used for identification. Unless you have less than 30 observations,
#' it is best to stay with row numbers and modify the 'x_limits' and 'x_major_breaks'
#' arguments.
#' @param influence_meas A string that defines the type of influence measure to apply. Acceptable
#' values include "internal", "external", "dffits", and "cook" for
#' internal/external studentized residuals, difference in fits, and Cook's distance respectively.
#' @param label_threshold A numeric that sets the measurement threshold beyond which observations
#' will be labeled with their id.
#' @param label_color A string that sets the label/point color for observations whose absolute
#' measurement is greater than the 'label_threshold'.
#' @param title A string that sets the plot title.
#' @param subtitle A string that sets the plot subtitle.
#' @param x_title A string that sets the observed response x axis title. If \code{NULL} then
#' the x axis title does not appear. The default is "Observation ID".
#' @param y_title A string that sets the fitted response y axis title. If \code{NULL} then
#' the y axis title does not appear. The default is "Influence Value".
#' @param rot_y_tic_label A logical which if \code{TRUE} rotates the y tic labels 90 degrees for enhanced readability.
#' @param x_limits A numeric 2 element vector that sets the minimum and maximum for the x axis.
#' @param x_major_breaks A numeric vector or function that defines the exact major tic locations along the x axis.
#' @param y_limits A numeric 2 element vector that sets the minimum and maximum for the y axis.
#' Use \code{NA} to refer to the existing minimum and maximum.
#' @param y_major_breaks A numeric vector or function that defines the exact major tic locations along the y axis.
#' @param y_minor_breaks A numeric vector or function that defines the exact minor tic locations along the y axis.
#' @param axis_text_size A numeric that sets the font size along the axis'. Default is 11.
#' @param pts_color A string that sets the color of the points.
#' @param pts_fill A string that sets the fill color of the points.
#' @param pts_shape A numeric integer that sets the shape of the points. Typical values are 21 \dQuote{circle},
#' 22 \dQuote{square}, 23 \dQuote{diamond}, 24 \dQuote{up triangle}, 25 \dQuote{down triangle}.
#' @param pts_stroke A numeric that sets the drawing width for a point shape.
#' @param pts_alpha A numeric value that sets the alpha level of \code{pts_color}.
#' @param pts_size A numeric value that sets the size of the points.
#' @param show_major_grids A logical that controls the appearance of major grids.
#' @param show_minor_grids A logical that controls the appearance of minor grids.
#'
#' @return Function returns a named list with both a data frame of influence measures
#' \dQuote{influence_df} for each observation along with a scatter plot of observations versus
#' influence measure \dQuote{influence_plot}.
#'
#' @examples
#' library(wooldridge)
#' library(ggplot2)
#' library(data.table)
#' library(RplotterPkg)
#' library(RregressPkg)
#'
#' rdchem_dt <- data.table::as.data.table(wooldridge::rdchem) |>
#' _[, .(rdintens, sales, profmarg)]
#'
#' formula_obj <- rdintens ~ sales + profmarg
#' rdchem_influence_lst <- RregressPkg::plot_influence(
#' df = rdchem_dt,
#' formula_obj = formula_obj,
#' influence_meas = "cook",
#' label_threshold = 3.0,
#' title = "Cook's Distance for Data Point Influence",
#' subtitle = "Source: Wooldridge::rdchem",
#' rot_y_tic_label = TRUE
#' )
#' a_plot <- rdchem_influence_lst$plot
#'
#' @import ggplot2
#' @importFrom RplotterPkg create_scatter_plot
#'
#' @export
plot_influence <- function(
df = NULL,
formula_obj = NULL,
id_col = NULL,
influence_meas = "cook",
label_threshold = 3,
label_color = "red",
title = NULL,
subtitle = NULL,
x_title = "Observation ID",
y_title = "Influence Value",
rot_y_tic_label = FALSE,
x_limits = NULL,
x_major_breaks = waiver(),
y_limits = NULL,
y_major_breaks = waiver(),
y_minor_breaks = waiver(),
axis_text_size = 11,
pts_color = "black",
pts_fill = "white",
pts_shape = 21,
pts_stroke = 1,
pts_alpha = 1.0,
pts_size = 1,
show_major_grids = TRUE,
show_minor_grids = TRUE
){
# compute the OLS
X_mt <- stats::model.matrix(formula_obj, data = df)
Y_mt <- as.matrix(subset(stats::model.frame(formula_obj, data = df),select = 1))
n <- nrow(X_mt)
k <- ncol(X_mt)
qr_lst <- base::qr(X_mt)
Qf_mt <- base::qr.Q(qr_lst) # n x k
Hat_ii <- diag(Qf_mt %*% t(Qf_mt)) # n x 1
fitted_mt <- (Qf_mt %*% t(Qf_mt) %*% Y_mt) # n x 1
resid_mt <- base::qr.resid(qr_lst, Y_mt) # n x 1
sse <- (t(resid_mt) %*% resid_mt)[1,1]
mse <- sse/(n - k)
if(influence_meas == "internal"){
# compute studentized residuals or internally studentized residuals
influence_vals <- resid_mt / sqrt(mse * (1 - Hat_ii))
}else if(influence_meas == "external"){
# compute the studentized deleted residuals or externally studentized residuals
student_residuals <- resid_mt / sqrt(mse * (1 - Hat_ii))
numer <- n - k - 1
denom <- n - k - student_residuals^2
influence_vals <- student_residuals * sqrt(numer/denom)
}else if(influence_meas == "dffits"){
# compute the Difference in Fits(DFFITS)
influence_vals <- vector(mode="numeric", length = n)
for(i in 1:n){
X_i_mt <- X_mt[-i,,drop=F]
Y_i_mt <- Y_mt[-i,,drop=F]
qr_i_lst <- base::qr(X_i_mt)
coef_i_mt <- (base::qr.coef(qr_i_lst, Y_i_mt))
fitted_i_mt <- X_mt %*% coef_i_mt
dif_fit_i_mt <- fitted_mt - fitted_i_mt
resid_i_mt <- base::qr.resid(qr_i_lst, Y_i_mt)
sse_i <- (t(resid_i_mt) %*% resid_i_mt)[1,1]
mse_i <- sse_i/(n - k - 1)
denom <- sqrt(mse_i * Hat_ii)
influence_vals[[i]] <- (dif_fit_i_mt/denom)[i,]
}
influence_vals <- as.data.frame(influence_vals)
}else if(influence_meas == "cook"){
# compute Cook's distance measure
influence_vals <- vector(mode="numeric", length = n)
for(i in 1:n){
Residual <- ((resid_mt)^2)/(k * mse)
Leverage <- Hat_ii/(1 - Hat_ii)^2
influence_vals[[i]] <- (Residual * Leverage)[i,]
}
influence_vals <- as.data.frame(influence_vals)
}
# create a plot object of the influence values
id <- NULL
if(!is.null(id_col)){
id_v <- df[[id_col]]
influence_df <- data.frame(
id = factor(id_v, levels = id_v)
)
}else {
id_v <- 1: nrow(df)
influence_df <- data.frame(
id = id_v
)
}
colnames(influence_vals) <- "influence_vals"
influence_df <- cbind(influence_df, influence_vals)
influence_plot <- RplotterPkg::create_scatter_plot(
df = influence_df,
aes_x = "id",
aes_y = "influence_vals",
title = title,
subtitle = subtitle,
x_title = x_title,
y_title = y_title,
rot_y_tic_label = rot_y_tic_label,
x_limits = x_limits,
x_major_breaks = x_major_breaks,
y_limits = y_limits,
y_major_breaks = y_major_breaks,
y_minor_breaks = y_minor_breaks,
axis_text_size = axis_text_size,
pts_color = pts_color,
pts_shape = pts_shape,
pts_stroke = pts_stroke,
pts_line_alpha = pts_alpha,
pts_size = pts_size,
show_major_grids = show_major_grids,
show_minor_grids = show_minor_grids
)
label_data_df <- subset(influence_df, abs(influence_vals) >= label_threshold)
if(nrow(label_data_df) > 0){
influence_plot <- influence_plot +
ggplot2::geom_point(data = label_data_df, color = label_color, size = 2.5) +
ggplot2::geom_text(
data = label_data_df,
aes(
label = id,
hjust = -0.2,
vjust = -0.4
),
color = label_color
)
}
return(
list(
influence = influence_df,
plot = influence_plot
)
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.