#' Variable selection with Permuted Inclusion Criterion
#'
#' Method of forward variable selection based on deviance for Bradley-Terry
#' models using pairwise ranking data. The selection procedure consists of two steps,
#' first, permuting the variables from the original \code{predictors} with
#' \code{n.iteractions}, then performing a forward selection to retain the
#' \code{predictors} with highest contribution to the model, see details.
#'
#' @family model selection functions
#' @author Jonathan Steinke and Kauê de Sousa
#' @param contests a data frame with pairwise binary contests with these variables
#' 'id','player1','player2','win1','win2'; in that order. The id should be equivalent
#' to the index of each row in \code{predictors}
#' @param predictors a data frame with player-specific variables with row indices that
#' should match with the ids in \code{contests}. An id is not required, only the
#' predictor variables, the ids are the index for each row
#' @param n.iterations integer, number of iterations to compute
#' @param seed integer, the seed for random number generation. If NULL (the default),
#' \pkg{gosset} will set the seed randomly
#' @param ... additional arguments passed to \pkg{BradleyTerry2} methods
#' @return an object of class \code{gosset_btpermute} with the final \code{BTm()} model,
#' selected variables, seeds (random numbers) used for permutations and deviances
#' @seealso \code{\link{set_binomialfreq}}, \code{\link[BradleyTerry2]{BTm}}
#' @details
#' The selection procedure consists of two steps. In the first step, \code{btpermute}
#' adds to the set of original (candidate) \code{predictors} variables
#' an additional set of 'fake', permuted variables. This set of permuted
#' \code{predictors} is created
#' by assigning to each ranking the variables from another, randomly selected
#' ranking. The permuted variables are not expected to have any predictive
#' power for pairwise rankings. In the second step, \code{btpermute} adds
#' predictors to the Bradley-Terry model in a forward selection procedure.
#' Each \code{predictors} (real and permuted) is added to the null model
#' individually, and \code{btpermute} retains which variable reduces model
#' deviance most strongly. The two-step process is replicated \code{n} times
#' with argument \code{n.iterations}. At each iteration, a new random permutation
#' is generated and all variables are tested. Replicability can be controlled
#' using argument \code{seed}. Across the n \code{n.iterations}, the function
#' identifies the predictor that appeared most often as the most deviance-reducing
#' one. When this is a real variable, it is constantly added to the model and
#' the forward selection procedure moves on – again creating new permutations,
#' adding real and fake variables individually, and examining model deviance.
#' Variable selection stops when a permuted variable is found to be most
#' frequently the most deviance-reducing \code{predictors} across \code{n.iterations}.
#' In turn, variable selection continuous as long as any real variable has stronger
#' explanatory power for pairwise rankings than the random variables.
#'
#' @references
#' Lysen, S. (2009) Permuted inclusion criterion: A variable selection technique.
#' University of Pennsylvania
#'
#' @examplesIf interactive()
#'
#' require("BradleyTerry2")
#'
#' data("kenyachoice", package = "gosset")
#'
#' mod <- btpermute(contests = kenyachoice$contests,
#' predictors = kenyachoice$predictors,
#' n.iterations = 10,
#' seed = 1)
#'
#' mod
#'
#' @importFrom BradleyTerry2 BTm
#' @importFrom stats deviance formula
#' @export
btpermute <- function(contests = NULL,
predictors = NULL,
n.iterations = 15,
seed = NULL,
...){
# define/set a seed to ensure reproducibility
if (is.null(seed)) {
seed <- runif(1, 1, 99999)
}
set.seed(seed)
seeds <- as.integer(runif(n.iterations, 1, 99999))
# contests should be a data.frame with 5 columns
# if not, then stop
if (isFALSE(dim(contests)[[2]] == 5)) {
stop("'contests' should be a data.frame with 5 columns with these variables:
c('id','player1','player2','win1','win2')\n")
}
# change names in contest
origContsnames <- names(contests)
names(contests) <- c("ObsID","Item1","Item2","Win1","Win2")
# check if players are factors
not_factor <- any(c(!is.factor(contests$Item1), !is.factor(contests$Item2)))
if (isTRUE(not_factor)) {
stop("'player1' and 'player2' should be factors with the same levels\n")
}
# check if players are factors with same level
items <- unique(contests$Item1)
items2 <- unique(contests$Item2)
not_factor <- any(levels(items) != levels(items2))
if (isTRUE(not_factor)) {
stop("'player1' and 'player2' should be factors with the same levels\n")
}
dat <- list()
dat[[1]] <- contests
dat[[2]] <- predictors
dat[[3]] <- as.data.frame(diag(length(items)))
# 'items' need to be assigned to the .GlobalEnv
# to be processed by BTm
#assign("items", items, envir = .GlobalEnv)
items <<- items
Win1 <- contests$Win1
Win2 <- contests$Win2
Item1 <- contests$Item1
Item2 <- contests$Item2
nvar <- dim(predictors)[[2]]
nobs <- dim(predictors)[[1]]
origVarnames <- names(dat[[2]])
origVarnames <- c(origVarnames, rep("Permuted variable", times = nvar))
Vars <- sprintf("Var%1d", seq_len(nvar))
names(dat[[2]]) <- Vars
rownames(dat[[3]]) <- items
colnames(dat[[3]]) <- items
permVars <- sprintf("PV%1d", seq_len(nvar))
allVars <- c(Vars,permVars)
# Calculating deviance of BT models
#first item in the item list is the reference item for BT models
refItem <- levels(items)[1]
# List of formula "extensions" to be added to the
# BTm models as variables are sequentially added.
nperm <- n.iterations + 1
BTmformula <- as.list(paste0("~ .. + ",
sprintf("Var%1d", seq_len(nperm)),
"[ObsID] * items[..]"))
BTmformula <- lapply(BTmformula, stats::as.formula)
# permutations formulas
BTmformulaPV <- as.list(paste0("~ .. + ",
sprintf("PV%1d", seq_len(nperm)),
"[ObsID] * items[..]"))
BTmformulaPV <- lapply(BTmformulaPV, stats::as.formula)
Covariates <- character()
deviances <- list()
selectedCovIndice <- integer()
d.VarOverview <- list()
selectedCovariates <- character()
for (l in seq_len(nvar)){
## this piece of code ensures that the loop terminates whenever a permuted
## variable would be selected for the model
if (l == 1) {
selectedCovIndex <- 0
}else{
selectedCovIndex <- match(Covariates[l-1], allVars)
}
selectedCovIndice[l-1] <- selectedCovIndex
# for the first variable, selectedCovIndex == 0
# thus, no variable is removed from the dataset
# if > nvar, then FirstCovariate is a PV and we stop here (break)
# exclude selected variable along with its permuted one
ifelse(
selectedCovIndex == 0,
allVars2 <- allVars,
ifelse(
selectedCovIndex > 0 & selectedCovIndex <= nvar,
allVars2 <- allVars[-c(selectedCovIndice,nvar + selectedCovIndice)],
break)
)
# Selecting covariates
# deviances at first round (to identify the first variable to include)
devl <- numeric(length = length(allVars2))
names(devl) <- allVars2
deviances[[l]] <- devl
# First, determining deviances of BTm models with individual *real* variables
varIndice <- seq_len(nvar)
indexCovariate <- match(Covariates, Vars)
if (length(indexCovariate) > 0) {
varIndice <- varIndice[-indexCovariate]
}
counter <- 0
if (length(Covariates) == 0) {
for (i in varIndice){
counter <- counter + 1
deviances[[l]][counter] <- stats::deviance(
BradleyTerry2::BTm(
outcome = cbind(Win1,Win2),
player1 = Item1,
player2 = Item2,
formula = BTmformula[[i]],
refcat = refItem,
data = dat,
...)
)
}
}
if (length(Covariates) > 0) {
for (i in varIndice) {
# the model sequentially tests all real variables (BTmformula[i])
# except for those that are already included in Covariates
# these are necessarily included in the model anyway
updatedFormula <- stats::as.formula(paste(
c(BTmformula[[i]], c(BTmformula[match(Covariates,allVars)])),
collapse="+")
)
counter <- counter + 1
deviances[[l]][counter] <- stats::deviance(
BradleyTerry2::BTm(
outcome = cbind(Win1,Win2),
player1 = Item1,
player2 = Item2,
formula = updatedFormula,
refcat = refItem,
data = dat,
...)
)
}
}
# Second, determining deviances of models with individual *permuted* variables
# stores the variable that caused greates deviance reduction,
# for each iteration in the loop
d.Var <- character(length = n.iterations)
devsPerIteration <- list()
message("Selecting variable #",l,"...\n")
for (j in seq_len(n.iterations)){
# permute = randomize rows, and create a new 'fake' data set
set.seed(seeds[j])
permRowOrder <- sample(rownames(dat[[2]]), replace = FALSE)
permutedData <- dat[[2]][permRowOrder, ]
names(permutedData) <- permVars
rownames(permutedData) <- seq_len(nobs)
# add the permuted data to the original data space
datPIC <- dat
datPIC[[2]] <- cbind.data.frame(dat[[2]], permutedData)
# data permuted, now fit models
# adds another half-filled vector to the devsPerIteration list at each iteration
devsPerIteration[[j]] <- deviances[[l]]
# again, omit permuted variables that correspond to already-included Covariates
for (k in ((length(varIndice)+1):length(allVars))[1:length(varIndice)]) {
# we can omit the same index (indexCovariate) because here we start at nvar+1
if (length(Covariates) == 0) {
# for the first Covariate (if length(Covariates)==0))
# the full dataset, including real and fake variables
devsPerIteration[[j]][k] <- stats::deviance(
BradleyTerry2::BTm(
outcome = cbind(Win1,Win2),
player1 = Item1,
player2 = Item2,
formula = BTmformulaPV[[k-length(varIndice)]],
refcat = refItem,
data = datPIC,
...)
)
}
if (length(Covariates) > 0) {
updatedFormula <- stats::formula(paste(
c(BTmformulaPV[[k - length(varIndice)]],
c(BTmformula[match(Covariates, allVars)])),
collapse="+"))
# when there is already on Covariate and we are adding to that
# ((if length(Covariates)!=0)))
devsPerIteration[[j]][k] <- stats::deviance(
BradleyTerry2::BTm(
outcome = cbind(Win1,Win2),
player1 = Item1,
player2 = Item2,
formula = updatedFormula,
refcat = refItem,
data = datPIC,
...)
)
}
}
# variable name corresponding to the smallest deviance value
d.Var[j] <- names(devsPerIteration[[j]])[devsPerIteration[[j]] %in%
min(devsPerIteration[[j]])]
}
#when a PV appears as often as a real variable, process stops
Covariates[l] <- names(table(d.Var))[table(d.Var) %in% max(table(d.Var))]
selectedCovariates[l] <- origVarnames[match(Covariates[l], allVars)]
}
selected <- selectedCovariates[-which(selectedCovariates == "Permuted variable")]
message("Selected ", toString(selected), "\n")
finalmodel <- stats::as.formula(paste(BTmformula[match(selected, origVarnames)],
collapse = " + "))
repl <- match(selected, origVarnames)
finalcall <- deparse(finalmodel, width.cutoff = 500)
for(i in seq_along(repl)) {
finalcall <- gsub(paste0("Var",repl[i]), selected[i], finalcall)
}
finalmodel <- stats::as.formula(finalcall)
names(dat[[2]]) <- names(predictors)
# fit final model with the selected variables
PICmodel <-
BradleyTerry2::BTm(
outcome = cbind(Win1, Win2),
player1 = Item1,
player2 = Item2,
formula = finalmodel,
refcat = refItem,
data = dat,
...)
result <- list(model = PICmodel,
formula = finalcall,
selected = as.character(selected),
seeds = seeds,
deviances = deviances)
class(result) <- union("gosset_btpermute", class(result))
return(result)
}
#' @method print gosset_btpermute
#' @export
print.gosset_btpermute <- function(x, ...) {
cat("Model formula:\n")
cat(x[["formula"]], "\n \n")
cat("Permutation estimates: \n")
p <- unlist(x[["model"]][c("aic","deviance",
"null.deviance",
"df.residual","df.null")])
names(p)[1] <- "AIC"
print(p)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.