R/dsldQeFairML.R
In dsld: Data Science Looks at Discrimination

Documented in dsldQeFairKNN dsldQeFairRF dsldQeFairRidgeLin dsldQeFairRidgeLog predict.dsldQeFair

# -------------------------------- QeFairBase ----------------------------------
# base function to use in qefairRF and qefairKNN. 
# returns: list with the class "dsldQeFair"
qeFairBase <- function(qeFUNC, data, yName, sNames, scaling, ...,
                       appendedItems = list()) {
  
  # scale data / expand factors, before training the model
  train <- fairScale(data, yName, sNames, scaling)
  base <- qeFUNC(train, yName, ...)
  
  model <- list(base = base)
  model <- append(model, appendedItems)
  model$scalePars <- attr(train, "scalePars")
  model$FactorsInfo = factor_levels(data)

  # return s3 object
  class(model) <- c("dsldQeFair")
  return(model)
}

# ------------------------------------- QeFairRF -------------------------------

# No scaling. Appends variables to the final model output via appendedItems
# Deweights are expanded to match the scaled data

dsldQeFairRF <- function(data, yName, sNames, deweightPars = NULL, nTree = 500,
                         minNodeSize = 10, mtry = floor(sqrt(ncol(data))),
                         yesYVal = NULL) {
  # setup
  scaling <- FALSE
  appendedItems <- variablesAsList(yName, sNames, scaling, deweightPars)

  if (is.factor(data[[yName]]) && is.null(yesYVal)) {
    stop("Missing `yesYVal` for a factor response.")
  } 
  
  if (!is.factor(data[[yName]]) && !is.null(yesYVal)) {
    yesYVal = NULL
  } 
  
  # expand deweights to match the expanded factors in the data in qeFairBase
  expandedDW <- expandDeweights(deweightPars, data[1, ])
  model <- qeFairBase(qeML::qeRFranger,
                      data,
                      yName,
                      sNames,
                      scaling,
                      nTree = nTree,
                      minNodeSize = minNodeSize,
                      mtry = mtry,
                      yesYVal = yesYVal,
                      holdout = NULL,
                      deweightPars = expandedDW,
                      appendedItems = appendedItems
  )
  # identifier for predict-time logic and y levels (classification)
  model$func_call <- "dsldQeFairRF"
  if (!is.null(yesYVal)) {
    model$yName_info <- levels(data[[yName]])
  }

  # training preds/corrs
  model$trainPreds <- predict.dsldQeFair(model, data[,!colnames(data) %in% c(yName)])
  
  if (!is.null(yesYVal)) {
    # classification case
    # map 0/1 predClasses to original factor labels for accuracy comparison
    y_levels <- levels(data[[yName]])
    noYVal <- setdiff(y_levels, yesYVal)[1]
    raw_pred <- model$trainPreds$preds$predClasses
    pred_classes_mapped <- ifelse(as.numeric(as.character(raw_pred)) == 1, yesYVal, noYVal)
    pred_classes_mapped <- factor(pred_classes_mapped, levels = y_levels)
    # overwrite extractable predClasses with mapped factor labels
    model$trainPreds$preds$predClasses <- pred_classes_mapped

    model$trainAcc <- mean(data[[yName]] != pred_classes_mapped)
    model$trainCorrs <- s_correlations(data, sNames, model$trainPreds$preds$probs[,2])
  } else {
    # regression case
    model$trainAcc <- mean(abs(model$trainPreds$preds - data[[yName]]))
    model$trainCorrs <- s_correlations(data, sNames, model$trainPreds$preds)
  }

  return(model)
}

# ------------------------------ QeFairKNN -------------------------------------

# May be scaled. Appends variables to the final model output via appendedItems
# Deweights are expanded to match the scaled data

dsldQeFairKNN <- function(data, yName, sNames, deweightPars = NULL,
                          yesYVal = NULL, k = 25, scaleX = TRUE) {
  # setup for easier calls
  scaling <- scaleX
  appendedItems <- variablesAsList(yName, sNames, scaling, deweightPars)
  
  # expand deweights to match the expanded factors in the data in qeFairBase
  expandedDW <- expandDeweights(deweightPars, data[1, ])
  expandVars <- names(expandedDW)
  expandVals <- unlist(expandedDW) # to use as qeKNN pars
  
  if (is.factor(data[[yName]]) && is.null(yesYVal)) {
    stop("Missing `yesYVal` for a factor response.")
  } 
  
  if (!is.factor(data[[yName]]) && !is.null(yesYVal)) {
    yesYVal = NULL
  } 
  
  model <- qeFairBase(qeML::qeKNN,
                      data,
                      yName,
                      sNames,
                      scaling,
                      k = k,
                      yesYVal = yesYVal,
                      holdout = NULL,
                      expandVars = expandVars,
                      expandVals = expandVals,
                      appendedItems = appendedItems
  )
  # identifier for predict-time logic and y levels (classification)
  model$func_call <- "dsldQeFairKNN"
  if (!is.null(yesYVal)) {
    model$yName_info <- levels(data[[yName]])
  }
  
  # training preds/corrs
  model$trainPreds <- predict.dsldQeFair(model, data[,!colnames(data) %in% c(yName)])

  if (!is.null(yesYVal)) {
    # classification case
    model$trainAcc <- mean(data[[yName]] != model$trainPreds$preds$predClasses)
    model$trainCorrs <- s_correlations(data, sNames, model$trainPreds$preds$probs)
  } else {
    # regression case
    model$trainAcc <- mean(abs(model$trainPreds$preds - data[[yName]]))
    model$trainCorrs <- s_correlations(data, sNames, model$trainPreds$preds)
  }
  
  return(model)
}

# -------------------------------- QeFairRidgeBase -----------------------------
#
# base function for the ridge models. 
# Creates its own training and testing sets 
# training set is modified w/ ridgeModify, according to EDF paper
# trains a qeML model with that training set as the base of the output list
# appends additional variables via variables as list
# appends testAcc, baseAcc, holdoutPreds with the testing set via predictHoldoutFair
#
qeFairRidgeBase <- function(data, yName, sNames, deweightPars, yesYVal = 0) {
  
  linear <- yesYVal == 0     # if we're using the default yesYVal, assume its linear
  
  # perform formula described in edffair paper
  dataExtended <- ridgeModify(data, yName, sNames, deweightPars, yesYVal)
  scalePars <- attr(dataExtended, "scalePars")
  
  base <- 
    if (linear) qeML::qeLin(dataExtended, yName, holdout = NULL)
  else          qeML::qeLogit(dataExtended, yName, holdout = NULL, yesYVal = yesYVal)
  
  # Append these variables to the model object
  classif <- !linear
  scaling <- TRUE
  appendedItems <- variablesAsList(
    sNames,
    yName,
    deweightPars,
    scaling,
    scalePars,
    classif,
    yesYVal
  )
  
  # construct output model object
  model <- list(base = base)
  model <- append(model, appendedItems)
  model$FactorsInfo = factor_levels(data)
  
  # return s3 object
  class(model) <- c("dsldQeFair")
  return(model)
}

# modify the training data as described in the edf fair paper
# essentially, you append a diagonal matrix to the bottom of the training data
# with each value of the diagonal optionally having a deweighting value
# the yCol is expanded w/ 0s or a no
# yesYVal - the yes value in the binary yName. if 0,
# we're assuming its a linear model

ridgeModify <- function(data, yName, sNames, deweightPars, yesYVal = 0) {
  # setup expanddeweighting
  expandDW <- expandDeweights(deweightPars, data[1, ])
  expandVars <- names(expandDW)
  expandVals <- unlist(expandDW)
  
  data <- fairScale(data, yName, sNames, scaling = TRUE)
  
  xNames <- colnames(data[,-ncol(data)])
  noYVal <- setdiff(levels(data[,yName]), yesYVal)[[1]]
  yBlank <- if (yesYVal == 0) 0 else noYVal
  p <- ncol(data) - 1   # how many predictors
  
  # formula described in edffair paper
  D <- setNames(rep(0, p), xNames)
  if (!is.null(deweightPars)) {
    D[expandVars] <- sqrt(expandVals)
  }
  
  extension <- data.frame(diag(D))
  extension <- cbind(extension, rep(yBlank, p))
  
  names(extension) <- colnames(data)
  dataExtended <- rbind(data, extension)
  
  attr(dataExtended, "scalePars") <- attr(data, "scalePars")
  return(dataExtended)
}


### dsldQeFairRidgeLin
dsldQeFairRidgeLin <- function(data, yName, sNames, deweightPars = NULL) {
  model = qeFairRidgeBase(data, yName, sNames, deweightPars)
  model$trainPreds <- predict.dsldQeFair(model, data[,!colnames(data) %in% c(yName)])
  model$trainAcc <- mean(abs(model$trainPreds$preds - data[[yName]]))
  model$trainCorrs <- s_correlations(data, sNames, model$trainPreds$preds)
  return(model)
}


# dsldQeFairRidgeLog
# works in the binary classification case
dsldQeFairRidgeLog <- function(data, yName, sNames, deweightPars = NULL, yesYVal) {
  model = qeFairRidgeBase(data, yName, sNames, deweightPars, yesYVal)
  # identifier and y levels for classification use later
  model$func_call <- "dsldQeFairRidgeLog"
  model$yName_info <- levels(data[[yName]])
  model$trainPreds <- predict.dsldQeFair(model, data[,!colnames(data) %in% c(yName)])
  model$trainAcc <- mean(data[[yName]] != model$trainPreds$preds$predClasses)
  model$trainCorrs <- s_correlations(data, sNames, model$trainPreds$preds$probs)
  return(model)
}

# ------------------------------- Predict() ------------------------------------
predict.dsldQeFair <- function(object, newx, ...) {
  # extract params from the model
  #yName <- model$yName
  sNames <- as.vector(object$sNames)
  scaling <- object$scaling
  scalePars <- object$scalePars
  
  
  newxNoS <- newx[, !colnames(newx) %in% c(sNames)]
  newxNoS <- apply_factor_levels(newxNoS, object$FactorsInfo)
  
  # rescale the data according to how the training data was scaled in the model
  newxNoS <- scaleNewX(newxNoS, scaling, scalePars)
  preds = predict(object$base, newxNoS)
  if (is.list(preds)) {
    if (is.null(dim(preds$probs))) {
      # it's a simple vector, already length n
      probs <- preds$probs
    } else {
      # it's a matrix/data frame with 2 columns (0/1)
      probs <- preds$probs[,1]
    }
    cors <- s_correlations(newx, sNames, probs)
  } else {
    cors = s_correlations(newx, sNames, preds)
  }
  return(list(preds = preds, correlations = cors))
}

# predict.dsldQeFair <- function(object, newx, ...) {
#   # extract params from the model
#   #yName <- model$yName
#   sNames <- as.vector(object$sNames)
#   scaling <- object$scaling
#   scalePars <- object$scalePars
  
  
#   newxNoS <- newx[, !colnames(newx) %in% c(sNames)]
#   newxNoS <- apply_factor_levels(newxNoS, object$FactorsInfo)
  
#   # rescale the data according to how the training data was scaled in the model
#   newxNoS <- scaleNewX(newxNoS, scaling, scalePars)
#   preds = predict(object$base, newxNoS)
#   if (is.list(preds)) {
#     if (is.null(dim(preds$probs))) {
#       # it's a simple vector, already length n
#       probs <- preds$probs
#     } else {
#       # it's a matrix/data frame with 2 columns (0/1)
#       probs <- preds$probs[,1]
#     }

#     # If RF classification, map 0/1 predClasses to original factor labels
#     if (identical(object$func_call, "dsldQeFairRF")) {
#       y_levels <- object$yName_info
#       yesYVal <- object$yesYVal
#       noYVal <- setdiff(y_levels, yesYVal)[1]
#       raw_num <- suppressWarnings(as.numeric(as.character(preds$predClasses)))
      
#       pred_classes_mapped <- ifelse(raw_num == 1, yesYVal, noYVal)
#       pred_classes_mapped <- factor(pred_classes_mapped, levels = y_levels)
#       # overwrite extractable predClasses with mapped factor labels
#       preds$predClasses <- pred_classes_mapped

#     }

#     cors <- s_correlations(newx, sNames, probs)
#   } else {
#     cors = s_correlations(newx, sNames, preds)
#   }
#   return(list(preds = preds, correlations = cors))
# }

### helper functions

# ------------- data scaling --------------
# factor types are split into binary columns for each level
# sNames and yName are removed in order for them to be scaled
# yName is added back.
# scaledPars are added as an attribute in order to rescale the newx in predict
# scaling - wether or not the data is scaled
# returns - a dataframe with a scalePars attribute

fairScale <- function(data, yName, sNames, scaling=FALSE) {
  predictors <- data[,!colnames(data) %in% c(sNames, yName)]
  response <- data[,yName]
  
  # expand factors
  predictors <- regtools::factorsToDummies(predictors, omitLast = TRUE)
  if (scaling) { 
    predictors <- scale(predictors)
    # save scalePars in order to rescale newx in predict function
    scalePars <- list(
      center=attr(predictors,'scaled:center'),
      scale=attr(predictors,'scaled:scale')
    )
  }
  # put it back together
  data <- cbind(data.frame(predictors), response)
  colnames(data)[ncol(data)] <- yName
  
  if (scaling) attr(data, 'scalePars') <- scalePars
  data
}

# factor types are split into binary columns for each level
# newx is scaled to match how the training data was scaled
# newx      - data to be converted before predicted on
# scaling   - was the model trained on scaled data
# scalePars - a list of (center, scale) that describes how the data was scaled
# returns   - a dataframe
scaleNewX <- function(newx, scaling=FALSE, scalePars=NULL) {
  newx <- regtools::factorsToDummies(newx, omitLast = TRUE)
  
  if (scaling) { 
    center <- TRUE; scale <- TRUE
    
    if (!is.null(scalePars)) { 
      center <- scalePars$center
      scale <- scalePars$scale
    }
    
    newx <- scale(newx, center=center, scale=scale)
  }
  
  newx <- data.frame(newx) # this changes the colnames for some reason
  newx
}

# --------- Expand Deweights --------------
# if one of the deweights is a factor, expand it into a dummy for all the qeFairs
#
# deweightPars - in the form of list(var_name=value)
# row1         - doesnt have to be 1 row, but the only thing this needs is
#                information on the data type and name of every column
#
# returns      - an expanded list(var_value=value) 
expandDeweights <- function(deweightPars, row1) {
  pars <- list()
  for (item in names(deweightPars)) {  # loop through every item
    if (is.factor(row1[,item])) {      # expand deweight if factor type
      names <- names(
        data.frame(
          regtools::factorToDummies(row1[,item], item, omitLast = TRUE)
        )
      )
      expanded <- Map(function(x) deweightPars[[item]], names)
      pars <- append(pars, expanded)
    } else {                          
      pars <- append(pars, deweightPars[item])  # add deweight as usual
    }
  }
  pars
}

# ------------ Utils -----------------
# utility to add variables to a list as an item w/ their own name
# ex. 'model$yName <- yName' for a list of variables
#
# ...     - one or more variables to be turned into a list
#
# returns - a list
variablesAsList <- function(...) {
  names <- as.list(substitute(list(...)))[-1]
  values <- list(...)
  list <- setNames(values, names)
  list
}

Any scripts or data that you put into this service are public.

dsld documentation built on Sept. 14, 2025, 1:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

dsld
Data Science Looks at Discrimination

R/dsldQeFairML.R
In dsld: Data Science Looks at Discrimination

Defines functions variablesAsList expandDeweights scaleNewX fairScale predict.dsldQeFair dsldQeFairRidgeLog dsldQeFairRidgeLin ridgeModify qeFairRidgeBase dsldQeFairKNN dsldQeFairRF qeFairBase

Documented in dsldQeFairKNN dsldQeFairRF dsldQeFairRidgeLin dsldQeFairRidgeLog predict.dsldQeFair

Try the dsld package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dsld Data Science Looks at Discrimination

R/dsldQeFairML.R In dsld: Data Science Looks at Discrimination

Defines functions variablesAsList expandDeweights scaleNewX fairScale predict.dsldQeFair dsldQeFairRidgeLog dsldQeFairRidgeLin ridgeModify qeFairRidgeBase dsldQeFairKNN dsldQeFairRF qeFairBase

Documented in dsldQeFairKNN dsldQeFairRF dsldQeFairRidgeLin dsldQeFairRidgeLog predict.dsldQeFair

Try the dsld package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dsld
Data Science Looks at Discrimination

R/dsldQeFairML.R
In dsld: Data Science Looks at Discrimination