statisfactory: Statistical and Geometrical Tools

Documented in makeFormulae

#' Make all possible formula
#'
#' This functions creates a list of formulae that contain all possible linear, quadratic, and two-way interaction terms from individual terms in an object of class \code{formula}. The formulae respect marginality conditions (i.e., they will always include lower-order terms if higher-order terms are included in a formula). Note that if there are more than several terms (i.e., >=3) and interactions and/or quadratic terms are desired, then formula generation may take a long time.
#'
#' @param formula 	A \code{formula} object with \emph{just} linear terms.
#' @param intercept Logical: If \code{TRUE} (default) then all models include an intercept.  If \code{FALSE} then then formula will specify that regression occurs through the origin (e.g., \code{y ~ -1 + etc.})
#' @param interceptOnly Logical: If \code{TRUE} then an intercept-only model is included in final set.
#' @param linearOnly Logical: If \code{TRUE} (default) then models with only linear terms are included in final set (plus other kinds of models if desired).
#' @param quad Logical: If \code{TRUE} (default), then include quadratic terms.
#' @param ia Logical: If \code{TRUE} (default), then include 2-way interaction terms.
#' @param verboten Character vector of terms that should not appear in the models. Ignored if \code{NULL} (default). You can use this argument, for example, to exclude specific interactions (e.g., \code{'x1:x2'}, but also include the converse, \code{'x2:x1'}), or power terms (e.g., \code{'I(x1^2)'}). Note that to ensure proper matching, you need to use a double backslash in front of each parenthesis and caret (\code(^)) character.
#' @param verbotenCombos List of lists: Used to specify specific combinations of terms that should not occur together. See \emph{Details} below. Ignored if \code{NULL} (default).
#' @param minTerms Either a positive integer representing the minimum number of terms required to be in a model, \emph{or} \code{NULL} (default) in which case the smallest model can have just one term.
#' @param maxTerms Either a positive integer representing the maximum number of terms allowed to be in a model, \emph{or} \code{NULL} (default) in which case there is no practical limit on the number of terms in a model.
#' @param returnFx Function used to generate the class of the output objects. Sensible functions in include \code{\link[stats]{as.formula}} (default) or \code{\link{as.character}}.
#' @param verbose Logical: If \code{TRUE} then display progress. Default is \code{FALSE}.
#'
#' @return A vector of formulae.
#'
#' @details  The argument \code{verbotenCombos} can be used to specify variables or terms that should not occur in the same formula. The argument \code{verbotenCombos} is composed of a list of lists. Each sublist comprises names of two variables or terms stated as characters followed by two logical values (\code{TRUE}/\code{FALSE}). The second variable/term is removed from the model if the first is in the model. If the first logical value is \code{TRUE} then the second variable/term is removed if the first variable appears alone in the formula (e.g., not in an interaction with another variable). If the first logical value is \code{FALSE} then the second variable/term is removed if the first variable/term appears in any term (e.g., as an interaction with another term).
#' Examples: \itemize{
#' \item \code{verbotenCombos=list(list('x1', 'x2', TRUE, TRUE))}: Removes \code{x2} if \code{x1} occurs in the model as a linear term.
#' \item \code{verbotenCombos=list(list('x1', 'x2', FALSE, TRUE))}: Removes the linear term \code{x2} if \code{x1} occurrs in \emph{any} term in the model.
#' \item \code{verbotenCombos=list(list('x1', 'x2', TRUE, FALSE))}: Removes \emph{any} term with \code{x2} if the linear term \code{x1} occurrs in the model.
#' \item \code{verbotenCombos=list(list('x1', 'x2', FALSE, FALSE))}: Removes any term with \code{x2} if any term has \code{x1}.
#' }
#' Quadratic terms and interaction terms can also be used, so: \itemize{
#' \item \code{verbotenCombos=list(list('x1', 'x1:x2', TRUE, TRUE))}: Removes \code{x1:x2} if \code{x1} were in the model.
#' \item \code{verbotenCombos=list(list('x1', 'I(x2^2)', TRUE, TRUE))}: Removes \code{I(x2^2)} if \code{x1} occurs in the model.
#' }
#' Note that inexact matching can remove terms incorrectly if inexact matches exist between names of terms or variables.  For example, if using an inexact match, then \code{verbotenCombos(list('x1', 'x2', FALSE, FALSE))} will find any term that has an \code{x1} (e.g., \code{x11}) and if it exists, remove any term with an \code{x2} (e.g., \code{x25}). Note that reciprocally removing predictors makes little sense since, for example \code{list(list('x1', 'x2', FALSE, FALSE), list('x2', 'x1', FALSE, FALSE))} removes all formulae with \code{x2} if \code{x1} appears then tries to find any models with \code{x2} that have \code{x1} (of which there will be none after the first set is removed).
#'
#' @examples
#'
#' makeFormulae(y ~ x1 + x2 + x3, maxTerms=3)
#' makeFormulae(y ~ x1 + x2 + x3, ia=FALSE, maxTerms=3)
#' verboten <- c('x1:x2', 'I(x1^2)')
#' makeFormulae(y ~ x1 + x2 + x3, verboten=verboten, maxTerms=3)
#'
#' makeFormulae(y ~ x1 + x2 + x3, maxTerms=3)
#' verbotenCombos <- list(list('x1', 'x2', TRUE, TRUE))
#' makeFormulae(y ~ x1 + x2 + x3, verbotenCombos=verbotenCombos, maxTerms=3)
#'
#' @export
makeFormulae <- function(
	formula,
	intercept = TRUE,
	interceptOnly = TRUE,
	linearOnly = TRUE,
	quad = TRUE,
	ia = TRUE,
	verboten = NULL,
	verbotenCombos = NULL,
	minTerms = NULL,
	maxTerms = NULL,
	returnFx = stats::as.formula,
	verbose = FALSE
) {

	# get response and predictor(s)
	resp <- all.vars(formula)[1]
	preds <- all.vars(formula)
	preds <- preds[2:length(preds)]

	# stop making candidate models when this number of terms have been reached
	stopAt <- if (is.null(maxTerms)) { length(preds) } else { min(length(preds), maxTerms) }

	### linear term-only models
	###########################

		if (verbose) omnibus::say('Making formulae with just linear terms...')

		linearModels <- list()

		for (countTerms in 1:stopAt) {

			modelAsList <- apply(utils::combn(preds, countTerms), 2, as.list)
			for (count in seq_along(modelAsList)) linearModels[[length(linearModels) + 1]] <- unlist(modelAsList[[count]])

		}

		if (!is.null(verboten)) linearModels <- .removeVerbotenVariables(linearModels, verboten=verboten)
		if (!is.null(verbotenCombos)) linearModels <- .removeVerbotenVariableCombos(linearModels, verbotenCombos=verbotenCombos)
		models <- if (linearOnly) { linearModels } else { list() }

	### quadratic models
	####################

		if (quad) {

			quadModels <- list()

			### all quadratic models with just linear terms that appear also as quadratic terms
			###################################################################################

				if (verbose) omnibus::say('Making formulae with just linear terms that appear also as quadratic terms...')

				for (countTerms in 1:stopAt) {

					modelAsList <- apply(utils::combn(preds, countTerms), 2, as.list)
					for (count in seq_along(modelAsList)) quadModels[[length(quadModels) + 1]] <-
						c(unlist(modelAsList[[count]]), paste('I(', unlist(modelAsList[[count]]), '^2)', sep=''))

				}

				# remove models above a given order
				if (!is.null(maxTerms)) {

					numTerms <- unlist(lapply(quadModels, length))
					wantModels <- which(numTerms <= maxTerms)

					selectModels <- list()
					for (want in wantModels) selectModels[[length(selectModels) + 1]] <- quadModels[[want]]
					quadModels <- selectModels

				}

				# remove models with forbidden term combinations
				if (!is.null(verboten)) quadModels <- .removeVerbotenVariables(quadModels, verboten=verboten)
				if (!is.null(verbotenCombos)) quadModels <- .removeVerbotenVariableCombos(quadModels, verbotenCombos=verbotenCombos)

			### models with linear terms that do not also appear as quadratic terms
			#######################################################################

				if (verbose) cat('Making formulae with quadratic and appropriate linear\n
								  terms plus linear terms that do not also appear as\n
								  quadratic terms...')

				for (countQuad in seq_along(quadModels)) {

					for (countLinear in seq_along(linearModels)) {

						quadModels[[length(quadModels) + 1]] <- unique(c(linearModels[[countLinear]], quadModels[[countQuad]]))

					}

				}

				# remove redundant models
				quadModels <- .removeRedundantModels(quadModels)

				# remove models above a given order
				if (!is.null(maxTerms)) {

					numTerms <- unlist(lapply(quadModels, length))
					wantModels <- which(numTerms <= maxTerms)

					selectModels <- list()
					for (want in wantModels) selectModels[[length(selectModels) + 1]] <- quadModels[[want]]
					quadModels <- selectModels

				}

				# remove models with forbidden term combinations
				if (!is.null(verboten)) quadModels <- .removeVerbotenVariables(quadModels, verboten=verboten)
				if (!is.null(verbotenCombos)) quadModels <- .removeVerbotenVariableCombos(quadModels, verbotenCombos=verbotenCombos)

			# remember
			models <- c(models, quadModels)

		}

	### models with 2-ways interactions
	###################################

		iaModels <- list()

		if (ia & length(preds) > 1) {

			# all models with 2-way interactions where linear terms also appear in interaction terms
			if (verbose) omnibus::say('Making formulae with two-way interaction and appropriate linear terms...')

			for (countTerms in 2:stopAt) {

				theseTerms <- utils::combn(preds, countTerms)

				linearRows <- nrow(theseTerms)

				for (countTermOne in 1:(linearRows - 1)) {

					for (countTermTwo in (countTermOne + 1):linearRows) {

						theseTerms <- rbind(theseTerms, paste(theseTerms[countTermOne, ], theseTerms[countTermTwo, ], sep=':'))

					}

				}

				for (countModels in 1:ncol(theseTerms)) iaModels[[length(iaModels) + 1]] <- c(theseTerms[ , countModels])

			}

			# remove models above a given order
			if (!is.null(maxTerms)) {

				numTerms <- unlist(lapply(iaModels, length))
				wantModels <- which(numTerms <= maxTerms)

				selectModels <- list()
				for (want in wantModels) selectModels[[length(selectModels) + 1]] <- iaModels[[want]]
				iaModels <- selectModels

			}

			# if there are any IA models
			if (length(iaModels) > 0) {

				# remove models with forbidden term combinations
				if (!is.null(verboten)) iaModels <- .removeVerbotenVariables(iaModels, verboten=verboten)
				if (!is.null(verbotenCombos)) iaModels <- .removeVerbotenVariableCombos(iaModels, verbotenCombos=verbotenCombos)

				# all models with 2-way interaction terms with appropriate linear terms and some other linear terms that are not part of interaction terms
				if (verbose) omnibus::say('Making formulae with two-way interaction and appropriate linear terms plus other linear terms that are not part of interaction terms...')

				for (countIa in seq_along(iaModels)) {

					for (countLinear in seq_along(linearModels)) {

						iaModels[[length(iaModels) + 1]] <- unique(c(linearModels[[countLinear]], iaModels[[countIa]]))

					}

				}

				# remove redundant models
				iaModels <- .removeRedundantModels(iaModels)

				# remove models above a given order
				if (!is.null(maxTerms)) {

					numTerms <- unlist(lapply(iaModels, length))
					wantModels <- which(numTerms <= maxTerms)

					selectModels <- list()
					for (want in wantModels) selectModels[[length(selectModels) + 1]] <- iaModels[[want]]
					iaModels <- selectModels

				}

				# remove models with forbidden term combinations
				if (!is.null(verboten)) iaModels <- .removeVerbotenVariables(iaModels, verboten=verboten)
				if (!is.null(verbotenCombos)) iaModels <- .removeVerbotenVariableCombos(iaModels, verbotenCombos=verbotenCombos)

				# remember
				for (count in seq_along(iaModels)) models[[length(models) + 1]] <- iaModels[[count]]

			}

		}

	### models with quadratic and interaction terms
	###############################################

		if (quad & ia & length(iaModels) > 0) {

			iaQuadModels <- list()

			# add quadratic and interaction models
			if (verbose) omnibus::say('Making formulae with all possible two-way interaction, quadratic, and appropriate linear terms...')

			for (countQuad in seq_along(quadModels)) {

				for (countIa in seq_along(iaModels)) {

					iaQuadModels[[length(iaQuadModels) + 1]] <- c(quadModels[[countQuad]], iaModels[[countIa]])

				}

			}

			# remove redundant models
			iaQuadModels <- .removeRedundantModels(iaQuadModels)

			# remove models above a given order
			if (!is.null(maxTerms)) {

				numTerms <- unlist(lapply(iaQuadModels, length))
				wantModels <- which(numTerms <= maxTerms)

				selectModels <- list()
				for (want in wantModels) selectModels[[length(selectModels) + 1]] <- iaQuadModels[[want]]
				iaQuadModels <- selectModels

			}

			# remove models with forbidden term combinations
			if (!is.null(verboten) & length(iaQuadModels) > 0) iaQuadModels <- .removeVerbotenVariables(iaQuadModels, verboten=verboten)
			if (!is.null(verbotenCombos) & length(iaQuadModels) > 0) iaQuadModels <- .removeVerbotenVariableCombos(iaQuadModels, verbotenCombos=verbotenCombos)

			### add linear terms that may not be in combined quad and interaction models
			############################################################################

			if (verbose) omnibus::say('Making formulae with all possible two-way interaction, quadratic, and appropriate linear terms plus linear terms not already in formula...')

			# if there are still any IA quadratic models
			if (length(iaQuadModels) > 0) {

				for (countIaQuad in seq_along(iaQuadModels)) {

					for (countLinear in seq_along(linearModels)) {

						iaQuadModels[[length(iaQuadModels) + 1]] <- c(linearModels[[countLinear]], iaQuadModels[[countIaQuad]])

					}

				}

				# remove redundant models
				iaQuadModels <- .removeRedundantModels(iaQuadModels)

				# remove models with forbidden term combinations
				if (!is.null(verboten)) iaQuadModels <- .removeVerbotenVariables(iaQuadModels, verboten=verboten)
				if (!is.null(verbotenCombos)) iaQuadModels <- .removeVerbotenVariableCombos(iaQuadModels, verbotenCombos=verbotenCombos)

				# remember
				for (count in seq_along(iaQuadModels)) models[[length(models) + 1]] <- iaQuadModels[[count]]

			}

		}

	### remove models below a desired order
	if (!is.null(minTerms)) {

		numTerms <- unlist(lapply(models, length))
		wantModels <- which(numTerms >= minTerms)

		selectModels <- list()
		for (want in wantModels) selectModels[[length(selectModels) + 1]] <- models[[want]]
		models <- selectModels

	}

	### remove models above a desired order
	if (length(models) > 0 & !is.null(maxTerms)) {

		numTerms <- unlist(lapply(models, length))
		wantModels <- which(numTerms <= maxTerms)

		selectModels <- list()
		for (want in wantModels) selectModels[[length(selectModels) + 1]] <- models[[want]]
		models <- selectModels

	}

	### remove redundants
	if (length(models) > 0) models <- .removeRedundantModels(models)

	### make model formulae
	if (length(models) > 0) {

		forms <- list()
		for (countModels in seq_along(models)) {
			forms[[countModels]] <- paste(resp, '~', ifelse(intercept, '1 +', '-1 + '), paste(models[[countModels]], collapse=' + '))
		}

		if (interceptOnly) forms[[length(forms) + 1]] <- paste(resp, '~', 1)

		forms <- sapply(forms, returnFx)

	} else {
		forms <- returnFx(character())
	}

	forms

}


#' Remove formula with unwanted terms.
#'
#' This function takes as an argument a vector of character strings representing formulae and returns a potentially shortened list without formulae containing a certain term.
#'
#' @param forms Vector of characters each representing a formula.
#' @param verboten Either \code{NULL} (default) in which case \code{forms} is returned without any manipulation. Alternatively, this is a character list of terms that are not allowed to appear in any model in \code{forms}. Models with these terms are removed from \code{forms}. Note that the order of variables in interaction terms does not matter (e.g., \code{x1:x2} will cause the removal of models with this term verbatim as well as \code{x2:x1}). All possible permutations of three-way interaction terms are treated similarly.
#' @return A list of character elements representing formulae.
#' @keywords internal
#' @noRd
.removeVerbotenVariables <- compiler::cmpfun(function(
	forms,
	verboten
) {

	# ensure each variant of an interaction term is represented
	for (i in seq_along(verboten)) {
		if (grepl(verboten[i], pattern=':')) {

			splitTerms <- strsplit(verboten[i], split=':')[[1]]
			swappedTerms <- if (length(splitTerms) == 2) {
				paste0(splitTerms[2], ':', splitTerms[1])
			} else if (length(splitTerms) == 3) {
				c(
					paste0(splitTerms[1], ':', splitTerms[3], ':', splitTerms[2]),
					paste0(splitTerms[2], ':', splitTerms[1], ':', splitTerms[3]),
					paste0(splitTerms[2], ':', splitTerms[3], ':', splitTerms[1]),
					paste0(splitTerms[3], ':', splitTerms[1], ':', splitTerms[2]),
					paste0(splitTerms[3], ':', splitTerms[2], ':', splitTerms[1])
				)
			}

			verboten <- c(verboten, swappedTerms)

		}

	}

	verboten <- unique(verboten)

	# remove formulae with undesired term(s)
	removeThese <- numeric()
	for (countModel in seq_along(forms)) {
		if (any(forms[[countModel]] %in% verboten)) removeThese <- c(removeThese, countModel)
	}
	
	if (length(removeThese) > 0) forms <- forms[-removeThese]
	forms

})

#' Remove formula with unwanted combinations of variables.
#'
#' This function takes as an argument a list of character strings representing formulae and returns a potentially shortened list without formulae containing certain combinations of variables.
#' @param forms List of characters each representing a formula.
#' @param verbotenCombos Either \code{NULL} (default) in which case \code{forms} is returned without any manipulation. Alternatively, this argument can be used to specify variables or terms that should not occur in the same formula. The argument \code{verbotenCombos} is composed of a list of lists. Each sublist comprises names of two variables or terms stated as characters followed by two logical values (\code{TRUE}/\code{FALSE}). The second variable/term is removed from the model if the first is in the model. If the first logical value is \code{TRUE} then the second variable/term is removed if the first variable appears alone in the formula (e.g., not in an interaction with another variable). If the first logical value is \code{FALSE} then the second variable/term is removed if the first variable/term appears in any term (e.g., as an interaction with another term). 
#' Examples: \itemize{
#' \item \code{verbotenCombos=list(list('x1', 'x2', TRUE, TRUE))}: Removes \code{x2} if \code{x1} occurs in the model as a linear term.
#' \item \code{verbotenCombos=list(list('x1', 'x2', FALSE, TRUE))}: Removes the linear term \code{x2} if \code{x1} occurred in \emph{any} term in the model.
#' \item \code{verbotenCombos=list(list('x1', 'x2', TRUE, FALSE))}: Removes \emph{any} term with \code{x2} if the linear term \code{x1} occurred in the model.
#' \item \code{verbotenCombos=list(list('x1', 'x2', FALSE, FALSE))}: Removes any term with \code{x2} if any term had \code{x1}.
#' }
#' Quadratic terms and interaction terms can also be stated, so: \itemize{
#' \item \code{verbotenCombos=list(list('x1', 'x1:x2', TRUE, TRUE))}: Removes \code{x1:x2} if \code{x1} were in the model.
#' \item \code{verbotenCombos=list(list('x1', 'I(x2^2)', TRUE, TRUE))}: Removes \code{I(x2^2)}.
#' }
#' Note that inexact matching can remove terms incorrectly if inexact matches exist between names of terms or variables.  For example, if using an inexact match, then \code{verbotenCombos(list('x1', 'x2', FALSE, FALSE))} will find any term that has an \code{x1} (e.g., \code{x11}) and if it exists, remove any term with an \code{x2} (e.g., \code{x25}). Note that reciprocally removing predictors makes little sense since, for example \code{list(list('x1', 'x2', FALSE, FALSE), list('x2', 'x1', FALSE, FALSE))} removes all formulae with \code{x2} if \code{x1} appears then tries to find any models with \code{x2} that have \code{x1} (of which there are none).
#' @return A list of character elements representing formulae.
#' @keywords internal
.removeVerbotenVariableCombos <- compiler::cmpfun(function(
	forms,
	verbotenCombos
) {

	# stores indices of models to be removed
	removeThese <- numeric()

	for (countModel in seq_along(forms)) {

		for (countVerboten in seq_along(verbotenCombos)) {

			# is matching exact or not?
			exactMatch1 <- unlist(verbotenCombos[[countVerboten]])[3]=='TRUE'
			exactMatch2 <- unlist(verbotenCombos[[countVerboten]])[4]=='TRUE'
	
			# does the exact term appear in the formula
			logicalExact1 <- forms[[countModel]] %in% verbotenCombos[[countVerboten]][1]
			logicalExact2 <- forms[[countModel]] %in% verbotenCombos[[countVerboten]][2]
			
			exactTerm1 <- any(logicalExact1)
			exactTerm2 <- any(logicalExact2)
			
			# if so what is its index?
			exactIndex1 <- which(logicalExact1)
			exactIndex2 <- which(logicalExact2)
			
			# does a form of the term appear in the formula?
			logicalLoose1 <- grepl(forms[[countModel]], pattern=verbotenCombos[[countVerboten]][1])
			logicalLoose2 <- grepl(forms[[countModel]], pattern=verbotenCombos[[countVerboten]][2])
			
			looseTerm1 <- any(logicalLoose1)
			looseTerm2 <- any(logicalLoose2)
			
			# if so what is its index?
			looseIndex1 <- which(logicalLoose1)
			looseIndex2 <- which(logicalLoose2)
			
			# want exact matches for both terms
			if (exactMatch1 & exactMatch2) {
								
				if (exactTerm1 & exactTerm2) removeThese <- c(removeThese, countModel)
	
			# want exact match for first term, loose match for second
			} else if (exactMatch1 & !exactMatch2) {
	
				if (exactTerm1 & looseTerm2) removeThese <- c(removeThese, countModel)
				
			# want exact match for second term, loose match for first
			} else if (!exactMatch1 & exactMatch2) {
	
				if (looseTerm1 & exactTerm2) removeThese <- c(removeThese, countModel)
				
			# want loose match for both terms
			} else if (!exactMatch1 & !exactMatch2) {

				if (looseTerm1 & looseTerm2) removeThese <- c(removeThese, countModel)
				
			}
	
		} # next verbotenCombos list
	
	} # next model

	if (length(removeThese) > 0) forms <- forms[-removeThese]
	
	forms
	
})

#' Remove redundant model forms from a list of models
#'
#' This function takes as an argument a list of character vectors. Each set of character vectors represents terms in a formula, and each element of a specific term in that formula. It returns a possibly shortened list with vectors culled.
#' @param formList List of character variables each in formula format.
#' @return List.
#' @keywords internal
.removeRedundantModels <- compiler::cmpfun(function(
	formList
) {

	formList <- lapply(formList, FUN=function(x) sort(unique(x)))
	numTerms <- sapply(formList, length)
	
	keep <- list()
	
	for (thisTerms in unique(numTerms)) {
	
		listTheseTerms <- formList[which(numTerms==thisTerms)]
		listTheseTerms <- if (thisTerms==1) { matrix(sapply(listTheseTerms, paste), nrow=1) } else { sapply(listTheseTerms, paste) }
		newTerms <- rep('eq', ncol(listTheseTerms))
		for (countRow in 1:nrow(listTheseTerms)) newTerms <- paste(newTerms, listTheseTerms[countRow, ])
		uniqueTerms <- unique(listTheseTerms, MARGIN=2)
		for (countCol in 1:ncol(uniqueTerms)) keep[[length(keep) + 1]] <- c(uniqueTerms[ , countCol])
		
	}
	
	keep
	
})