R/33_FAIRNESS_VLD.R
In PDtoolkit: Collection of Tools for PD Rating Model Development and Validation

Documented in fairness.vld

#' Model fairness validation
#'
#' \code{fairness.vld} performs fairness validation for a given sensitive attribute and selected outcome.
#' Sensitive attribute should be categorical variable with reasonable number of modalities, while
#' outcome can be categorical (e.g. reject/accept indicator or rating grade) or continuous (e.g. interest rate or amount).
#' Depending on model type outcome (see argument \code{mod.outcome.type}) Chi-square test or Wald test is applied.
#'@param db Data frame with sensitive attribute, observed outcome, model outcome and conditional attribute.
#'@param sensitive Name of sensitive attribute within \code{db}.
#'@param obs.outcome Name of observed outcome within \code{db}.
#'@param mod.outcome Name of model outcome within \code{db}.
#'@param conditional Name of conditional attribute within \code{db}. It is used for calculation of conditional
#'			   statistical parity. Default value is \code{NULL}.
#'@param mod.outcome.type Type of model outcome. Possible values are \code{disc} (discrete outcome) 
#'				   and \code{cont} (continuous).
#'@param p.value Significance level of applied statistical test (chi-square or Wald test).
#'@return The command \code{fairness.vld} returns a list of three data frames.\cr
#'	    The first object (\code{SP}), provides results of statistical parity testing.\cr
#'	    The second object (\code{CSP}), provides results of conditional statistical parity testing.
#'	    This object will be returned only if conditional attributed is supplied. \cr
#'	    The third object (\code{EO}), provides results of equal opportunity testing.\cr
#'@references
#'Hurlin, Christophe and Perignon, Christophe and Saurin, Sebastien (2022), 
#'The Fairness of Credit Scoring Models. HEC Paris Research Paper No. FIN-2021-1411
#'@examples
#'suppressMessages(library(PDtoolkit))
#'#build hypothetical model
#'data(loans)
#'#numeric risk factors
#'#num.rf <- sapply(loans, is.numeric)
#'#num.rf <- names(num.rf)[!names(num.rf)%in%"Creditability" & num.rf]
#'num.rf <- c("Credit Amount", "Age (years)")
#'#discretized numeric risk factors using ndr.bin from monobin package
#'loans[, num.rf] <- sapply(num.rf, function(x) 
#'ndr.bin(x = loans[, x], y = loans[, "Creditability"])[[2]])
#'str(loans)
#'#run stepMIV
#'rf <- c("Account Balance", "Payment Status of Previous Credit", 
#'        "Purpose", "Value Savings/Stocks", "Credit Amount",
#'        "Age (years)", "Instalment per cent", "Foreign Worker")
#'res <- stepMIV(start.model = Creditability ~ 1, 
#'	   miv.threshold = 0.02, 
#'	   m.ch.p.val = 0.05,
#'	   coding = "WoE",
#'	   coding.start.model = FALSE,
#'	   db = loans[, c("Creditability", rf)])
#'#print coefficients
#'summary(res$model)$coefficients
#'
#'#prepare data frame for fairness validation
#'db.fa <- data.frame(Creditability = loans$Creditability, 
#'			  mpred = predict(res$model, type = "response", newdata = res$dev.db))
#'#add hypothetical reject/accept indicator 
#'db.fa$rai <- ifelse(db.fa$mpred > 0.5, 1, 0)
#'#add hypothetical rating
#'db.fa$rating <- sts.bin(x = round(db.fa$mpred, 4), y = db.fa$Creditability)[[2]]
#'#add hypothetical interest rate
#'ir.r <- seq(0.03, 0.10, length.out = 6)
#'names(ir.r) <- sort(unique(db.fa$rating))
#'db.fa$ir <- ir.r[db.fa$rating]
#'#add hypothetical sensitive attribute
#'db.fa$sensitive.1 <- ifelse(loans$"Sex & Marital Status"%in%2, 1, 0) #not in a model
#'db.fa$sensitive.2 <- ifelse(loans$"Age (years)"%in%"03 [35,Inf)", 1, 0) #in a model
#'#add some attributes for calculation of conditional statistical parity
#'db.fa$"Credit Amount" <- loans$"Credit Amount" 
#'head(db.fa)
#'
#'#discrete model outcome - sensitive attribute not in a model
#'fairness.vld(db = db.fa, 
#'		 sensitive = "sensitive.1", 
#'		 obs.outcome = "Creditability", 
#'		 mod.outcome = "rai",
#'		 conditional = "Credit Amount", 
#'		 mod.outcome.type = "disc", 
#'		 p.value = 0.05)
#'##discrete model outcome - sensitive attribute in a model
#'#fairness.vld(db = db.fa, 
#'#		 sensitive = "sensitive.2", 
#'#		 obs.outcome = "Creditability", 
#'#		 mod.outcome = "rai",
#'#		 conditional = "Credit Amount", 
#'#		 mod.outcome.type = "disc", 
#'#		 p.value = 0.05)
#'##continuous outcome - sensitive attribute not in a model
#'#fairness.vld(db = db.fa, 
#'#		 sensitive = "sensitive.1", 
#'#		 obs.outcome = "Creditability", 
#'#		 mod.outcome = "ir",
#'#		 conditional = "Credit Amount", 
#'#		 mod.outcome.type = "cont", 
#'#		 p.value = 0.05)
#'#continuous outcome - sensitive attribute in a model
#'fairness.vld(db = db.fa, 
#'		 sensitive = "sensitive.2", 
#'		 obs.outcome = "Creditability", 
#'		 mod.outcome = "ir",
#'		 conditional = "Credit Amount", 
#'		 mod.outcome.type = "cont", 
#'		 p.value = 0.05)
#'@importFrom stats chisq.test lm
#'@export
fairness.vld <- function(db, sensitive, obs.outcome, mod.outcome, conditional = NULL, mod.outcome.type, p.value) {
	if	(!is.data.frame(db)) {
		stop("db is not a data frame.")
		}
	y <- db[, obs.outcome]
	cond.00 <- !sum(y[!is.na(y)]%in%c(0, 1)) == length(y[!is.na(y)])
	if	(cond.00) {
		stop("obs.outcome has to be 0/1 variable.")
		}
	if	(any(!c(sensitive, obs.outcome, mod.outcome, conditional)%in%names(db))) {
		stop("one of arguments sensitive, mod.outcome, obs.outcome or conditional cannot be found in the db data frame.")
		}
	mod.outcome.type.opt <- c("disc", "cont")
	if	(!mod.outcome.type%in%mod.outcome.type.opt) {
		stop(paste0("mod.outcome.type argument has to be one of: ", 
				paste0(mod.outcome.type.opt, collapse = ', '), "."))
		}
	if	((!is.numeric(p.value) | !length(p.value) == 1) |
		 !(p.value[1] > 0 & p.value[1] < 1)) {
		stop("p.value has to be of single numeric value vector greater than 0 and less then 1.")
		}
	cc <- complete.cases(db[, c(sensitive, obs.outcome, mod.outcome)])
	db <- db[cc, ]
	if	(nrow(db) == 0) {
		stop("No complete cases for db.")
		}
	if	(any(!cc)) {
		warning("There are some incomplete cases. Check sensitive, obs.outcome and mod.outcome columns.")
		}
	if	(mod.outcome.type%in%"disc")	{
		res <- disc.tests(db = db, 
				sensitive = sensitive, 
				obs.outcome = obs.outcome, 
				mod.outcome = mod.outcome, 
				conditional = conditional, 
				p.value = p.value)
		} else {
		res <- cont.tests(db = db, 
				sensitive = sensitive, 
				obs.outcome = obs.outcome, 
				mod.outcome = mod.outcome, 
				conditional = conditional, 
				p.value = p.value)
		}
return(res)
}

#discrete model outcome
disc.tests <- function(db, sensitive, obs.outcome, mod.outcome, conditional, p.value) {
	#statistical parity 
	sp <- disc.test.aux(db = db, 
			sensitive = sensitive, 
			mod.outcome = mod.outcome, 
			p.value = p.value)
	#conditional statistical parity
	csp <- NULL
	if	(!is.null(conditional)) {
		uc <- unique(conditional)
		ucl <- length(uc)
		csp <- vector("list", ucl)
		for	(i in 1:ucl) {
			conditional.l <- uc[i]
			csp.l <- disc.test.aux.c(db = db, 
						sensitive = sensitive, 
						mod.outcome = mod.outcome, 
						conditional = conditional.l, 
						p.value = p.value)
			csp.l <- cbind.data.frame(attribute = conditional.l, csp.l)
			csp[[i]] <- csp.l
			}
		csp <- bind_rows(csp)
		}
	#equal odds, equal opportunity, predictive equallity 
	eo <- disc.test.aux.c(db = db, 
			sensitive = sensitive, 
			mod.outcome = mod.outcome, 
			conditional = obs.outcome, 
			p.value = p.value)
	eo <- cbind.data.frame(metric = c("equal opportunity", "predictive equality", "equal odds"),
				     eo)	
	res <- list(SP = sp, CSP = csp, EO = eo)
return(res)
}
disc.test.aux <- function(db, sensitive, mod.outcome, p.value) {
	tbl <- table(db[, c(mod.outcome, sensitive)])
	test.res <-  chisq.test(x = tbl, correct = FALSE)
	df <- unname(test.res$parameter)
	statistic <- unname(test.res$statistic)
	p.val <- test.res$p.value
	res <- data.frame(nobs = sum(tbl), 
			statistic = statistic, 
			df = df, 
			p.val = p.val, 
			res = ifelse(p.val < p.value, "H0: independence assumption rejected.",
						      "H0: independence assumption cannot be rejected."))
return(res)
}
disc.test.aux.c <- function(db, sensitive, mod.outcome, conditional, p.value) {
	ucv <- sort(unique(db[, conditional]))
	ucvl <- length(ucv)
	res <- vector("list", ucvl)
	for	(i in 1:ucvl) {
		uc.l <- ucv[i]
		res.l <- disc.test.aux(db = db[db[, conditional]%in%uc.l, ], 
					sensitive = sensitive, 
					mod.outcome = mod.outcome, 
					p.value = p.value)
		res[[i]] <- cbind.data.frame(modality = as.character(uc.l), res.l)
		}
	res <- bind_rows(res)
	#overall test
	df.o <- sum(res$df)
	statistic.o <- sum(res$statistic)
	p.val.o <- pchisq(statistic.o, df.o, lower.tail = FALSE)
	res.o <- data.frame(modality = "overall",
			nobs = sum(res$nobs),
			statistic = statistic.o,
			df = df.o,
			p.val = p.val.o, 
			res = ifelse(p.val.o < p.value, "H0: independence assumption rejected.",
							"H0: independence assumption cannot be rejected."))
	res <- bind_rows(res, res.o)
return(res)
}
#continuous model outcome
cont.tests <- function(db, sensitive, obs.outcome, mod.outcome, conditional, p.value) {
	#statistical parity 
	sp <- cont.test.aux(db = db, 
			sensitive = sensitive, 
			mod.outcome = mod.outcome, 
			p.value = p.value)
	#conditional statistical parity
	csp <- NULL
	if	(!is.null(conditional)) {
		uc <- unique(conditional)
		ucl <- length(uc)
		csp <- vector("list", ucl)
		for	(i in 1:ucl) {
			conditional.l <- uc[i]
			csp.l <- cont.test.aux.c(db = db, 
						sensitive = sensitive, 
						mod.outcome = mod.outcome, 
						conditional = conditional.l, 
						p.value = p.value)
			csp.l <- cbind.data.frame(attribute = conditional.l, csp.l)
			csp[[i]] <- csp.l
			}
		csp <- bind_rows(csp)
		}
	#equal odds, equal opportunity, predictive equallity 
	eo <- cont.test.aux.c(db = db, 
				sensitive = sensitive, 
				mod.outcome = mod.outcome, 
				conditional = obs.outcome, 
				p.value = p.value)
	eo <- cbind.data.frame(metric = c("equal opportunity", "predictive equality", "equal odds"),
				     eo)	
	res <- list(SP = sp, CSP = csp, EO = eo)
return(res)
}
cont.test.aux <- function(db, sensitive, mod.outcome, p.value) {
	mod.frm <- paste0(mod.outcome, " ~ ", "as.factor(", eval(parse(text = "sensitive")), ")")	
	mod <- lm(formula = as.formula(mod.frm), data = db)
	res <- wald.test.aux(model = mod)
	res <- cbind.data.frame(res, 
				res = ifelse(res$p.val < p.value, "H0: average equality rejected.",
								"H0: average equality cannot be rejected."))
return(res)
}
wald.test.aux <- function(model) {
	vcm <- vcov(model)
	coef.est <- coef(model)
	terms <- 2:length(coef.est)
	if	(any(is.na(coef.est[terms]))) {return(1)}
	w <- length(terms)
	h0 <- rep(0, w)
	l <- matrix(rep(0, length(coef.est) * w), ncol = length(coef.est))
	for	(i in 1:w) {l[i, terms[i]] <- 1}
	f <- l%*%coef.est
	stat <- t(f - h0)%*%solve(l%*%vcm%*%t(l))%*%(f - h0)
	p.val <- pchisq(c(stat), df = w, lower.tail = FALSE)
	res <- data.frame(nobs = length(model$residuals), statistic = stat, df = w, p.val = p.val)
return(res)
}
cont.test.aux.c <- function(db, sensitive, mod.outcome, conditional, p.value) {
	ucv <- sort(unique(db[, conditional]))
	ucvl <- length(ucv)
	res <- vector("list", ucvl)
	for	(i in 1:ucvl) {
		uc.l <- ucv[i]
		res.l <- cont.test.aux(db = db[db[, conditional]%in%uc.l, ], 
					sensitive = sensitive, 
					mod.outcome = mod.outcome, 
					p.value = p.value)
		res[[i]] <- cbind.data.frame(modality = as.character(uc.l), res.l)
		}
	res <- bind_rows(res)
	#overall test
	df.o <- sum(res$df)
	statistic.o <- sum(res$statistic)
	p.val.o <- pchisq(statistic.o, df.o, lower.tail = FALSE)
	res.o <- data.frame(modality = "overall",
			nobs = sum(res$nobs),
			statistic = statistic.o,
			df = df.o,
			p.val = p.val.o, 
			res = ifelse(p.val.o < p.value, "H0: average equality rejected.",
							"H0: average equality cannot be rejected."))
	res <- bind_rows(res, res.o)
return(res)
}

Any scripts or data that you put into this service are public.

PDtoolkit documentation built on Sept. 20, 2023, 9:06 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

PDtoolkit
Collection of Tools for PD Rating Model Development and Validation

R/33_FAIRNESS_VLD.R
In PDtoolkit: Collection of Tools for PD Rating Model Development and Validation

Defines functions cont.test.aux.c wald.test.aux cont.test.aux cont.tests disc.test.aux.c disc.test.aux disc.tests fairness.vld

Documented in fairness.vld

Try the PDtoolkit package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PDtoolkit Collection of Tools for PD Rating Model Development and Validation

R/33_FAIRNESS_VLD.R In PDtoolkit: Collection of Tools for PD Rating Model Development and Validation

Defines functions cont.test.aux.c wald.test.aux cont.test.aux cont.tests disc.test.aux.c disc.test.aux disc.tests fairness.vld

Documented in fairness.vld

Try the PDtoolkit package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PDtoolkit
Collection of Tools for PD Rating Model Development and Validation

R/33_FAIRNESS_VLD.R
In PDtoolkit: Collection of Tools for PD Rating Model Development and Validation