Nothing
#' Genetic-Algorithm Best Subset Selection (GA / GAISL)
#'
#' @description
#' Runs a GA-based search over variable subsets using a user-specified
#' objective (default: \code{\link{subsetBIC}}) and returns a compact
#' \code{"gareg"} S4 result with \code{method = "subset"}.
#' The engine can be \code{\link[GA]{ga}} (single population) or
#' \code{\link[GA]{gaisl}} (islands), selected via \code{gaMethod}.
#'
#' @param y Numeric response vector (length \code{n}).
#' @param X Numeric matrix of candidate predictors (\code{n} rows by \code{p} columns).
#' @param ObjFunc Objective function or its name. Defaults to \code{\link{subsetBIC}}.
#' The objective must accept as its first argument a binary chromosome
#' (0/1 mask of length \code{p}) and may accept additional arguments passed via \code{...}.
#' By convention, \code{subsetBIC} returns \emph{negative} BIC, so the GA maximizes fitness.
#' @param gaMethod GA backend to call: \code{"ga"} or \code{"gaisl"} (functions from package \pkg{GA}),
#' or a GA-compatible function with the same interface as \code{\link[GA]{ga}}.
#' @param gacontrol Optional named list of GA engine controls (e.g., \code{popSize}, \code{maxiter},
#' \code{run}, \code{pcrossover}, \code{pmutation}, \code{elitism}, \code{seed}, \code{parallel},
#' \code{keepBest}, \code{monitor}, ...). These are passed to the GA engine, not to the objective.
#' @param monitoring Logical; if \code{TRUE}, prints a short message and (if not supplied in
#' \code{gacontrol}) sets \code{monitor = GA::gaMonitor} for live progress.
#' @param seed Optional RNG seed (convenience alias for \code{gacontrol$seed}).
#' @param ... Additional arguments forwarded to \code{ObjFunc} (not to the GA engine). For
#' \code{\link{subsetBIC}} these typically include \code{family}, \code{weights}, \code{offset},
#' and \code{control}.
#'
#' @details
#' The fitness passed to \pkg{GA} is \code{ObjFunc} itself. Because the engine expects
#' a function with signature \code{f(chrom, ...)}, your \code{ObjFunc} must interpret
#' \code{chrom} as a 0/1 mask over the columns of \code{X}. The function then computes a score
#' (e.g., negative BIC) using \code{y}, \code{X}, and any extra arguments supplied via \code{...}.
#'
#' With the default \code{\link{subsetBIC}}, the returned value is \code{-BIC}, so we set
#' \code{max = TRUE} in the GA call to maximize fitness. If you switch to an objective that
#' returns a quantity to \emph{minimize}, either negate it in your objective or change
#' the engine setting to \code{max = FALSE}.
#'
#' Engine controls belong in \code{gacontrol}; objective-specific options belong in \code{...}.
#' This separation prevents accidental name collisions between GA engine parameters and
#' objective arguments.
#'
#' @return An object of S4 class \code{"gareg"} (with \code{method = "subset"}) containing:
#' \itemize{
#' \item \code{call} – the matched call.
#' \item \code{N} – number of observations.
#' \item \code{objFunc} – the objective function used.
#' \item \code{gaMethod} – \code{"ga"} or \code{"gaisl"}.
#' \item \code{gaFit} – the GA fit object returned by \pkg{GA} (if your class allows it).
#' \item \code{featureNames} – column names of \code{X} (or empty).
#' \item \code{bestFitness} – best fitness value (\code{GA::ga}@fitnessValue).
#' \item \code{bestChrom} – \code{c(m, idx)}: number of selected variables and their indices.
#' \item \code{bestnumbsol} – \code{m}, number of selected variables.
#' \item \code{bestsol} – vector of selected column indices in \code{X}.
#' }
#'
#' @seealso
#' \code{\link{subsetBIC}},
#' \code{\link[GA]{ga}},
#' \code{\link[GA]{gaisl}}
#'
#' @examples
#' \donttest{
#' if (requireNamespace("GA", quietly = TRUE)) {
#' set.seed(1)
#' n <- 100
#' p <- 12
#' X <- matrix(rnorm(n * p), n, p)
#' y <- 1 + X[, 1] - 0.7 * X[, 4] + rnorm(n, sd = 0.5)
#'
#' # Default: subsetBIC (Gaussian – negative BIC), engine = GA::ga
#' fit1 <- gareg_subset(y, X,
#' gaMethod = "ga",
#' gacontrol = list(popSize = 60, maxiter = 80, run = 40, parallel = FALSE)
#' )
#' summary(fit1)
#'
#' # Island model: GA::gaisl
#' fit2 <- gareg_subset(y, X,
#' gaMethod = "gaisl",
#' gacontrol = list(popSize = 40, maxiter = 60, numIslands = 4, parallel = FALSE)
#' )
#' summary(fit2)
#'
#' # Logistic objective (subsetBIC handles GLM via ...):
#' ybin <- rbinom(n, 1, plogis(0.3 + X[, 1] - 0.5 * X[, 2]))
#' fit3 <- gareg_subset(ybin, X,
#' gaMethod = "ga",
#' family = stats::binomial(), # <- passed to subsetBIC via ...
#' gacontrol = list(popSize = 60, maxiter = 80, parallel = FALSE)
#' )
#' summary(fit3)
#' }
#' }
#'
#' @export
gareg_subset <- function(y,
X,
ObjFunc = NULL,
gaMethod = "ga",
gacontrol = NULL,
monitoring = FALSE,
seed = NULL,
...) {
if (!requireNamespace("GA", quietly = TRUE)) {
stop("Package 'GA' is required: install.packages('GA')")
}
call <- match.call()
X <- as.matrix(X)
nobs <- NROW(X)
p <- NCOL(X)
ga_fun <- if (is.function(gaMethod)) {
gaMethod
} else {
switch(tolower(as.character(gaMethod)),
"ga" = GA::ga,
"gaisl" = GA::gaisl,
stop("gaMethod must be 'ga' or 'gaisl' (from the GA package), or a GA-compatible function.")
)
}
ga_name <- if (identical(ga_fun, GA::gaisl)) "gaisl" else "ga"
if (is.null(ObjFunc)) {
ObjFunc <- subsetBIC
} else if (is.character(ObjFunc)) {
ObjFunc <- get(ObjFunc, mode = "function")
}
if (!is.function(ObjFunc)) stop("ObjFunc must be a function or a function name.", call. = FALSE)
engine_args <- as.list(gacontrol %||% list())
if (!is.null(seed)) engine_args$seed <- seed
if (isTRUE(monitoring) && is.null(engine_args$monitor)) engine_args$monitor <- GA::gaMonitor
if (isTRUE(monitoring)) message("Running best subset via GA (engine = ", ga_name, ")")
base_args <- list(
fitness = ObjFunc, # default as subsetBIC; expect chromosome with 0/1 mask
type = "binary",
nBits = p,
monitor = monitoring
)
GA.res <- do.call(
ga_fun,
c(
base_args,
engine_args,
list(y = y, X = X),
list(...)
)
)
sol <- GA.res@solution
if (is.matrix(sol)) sol <- sol[1L, ]
idx <- which(sol != 0)
mhat <- length(idx)
bestChrom <- c(mhat, idx)
`%||%` <- function(a, b) if (!is.null(a)) a else b
feat_names <- colnames(X) %||% character(p)
# Build gareg S4
object <- methods::new("gareg",
call = call,
method = "subset",
N = nobs,
objFunc = ObjFunc,
gaMethod = ga_name,
gaFit = GA.res,
featureNames = feat_names,
bestFitness = as.numeric(GA.res@fitnessValue),
bestChrom = as.numeric(bestChrom),
bestnumbsol = as.numeric(mhat),
bestsol = as.numeric(idx)
)
return(object)
}
#' Unified BIC-style Objective for Subset Selection (GLM & Gaussian)
#'
#' @description
#' Computes a BIC-like criterion for a chromosome that encodes a variable
#' subset. The same expression
#' \deqn{\mathrm{BIC} = n \log(\mathrm{rss\_like}/n) + k \log n}
#' is used for all families, where:
#' \itemize{
#' \item For Gaussian with identity link, \code{rss_like} is the residual sum of squares (RSS),
#' computed via a fast \code{.lm.fit}.
#' \item For other GLM families, \code{rss_like} is the residual \emph{deviance}
#' from \code{glm.fit}.
#' }
#' The effective parameter count \eqn{k} includes the intercept.
#'
#' @details
#' The chromosome \code{subset_bin} is a \emph{binary} vector (0/1 by column),
#' indicating which predictors from \code{X} are included. The design matrix
#' always includes an intercept. Rank-deficient selections return \code{Inf}
#' (which the GA maximizer treats as a very poor score). The value returned is
#' \strong{-BIC} so that GA engines can \emph{maximize} it.
#'
#' @param subset_bin Integer/numeric 0–1 vector (length \code{ncol(X)}); 1 means
#' the corresponding column of \code{X} is included in the model.
#' @param y Numeric response vector of length \code{n}.
#' @param X Numeric matrix of candidate predictors; columns correspond to variables.
#' @param family A GLM family object (default \code{stats::gaussian()}).
#' @param weights Optional prior weights (passed to \code{glm.fit}).
#' @param offset Optional offset (passed to \code{glm.fit}).
#' @param control GLM fit controls; default \code{stats::glm.control()}.
#'
#' @return A single numeric value: \strong{-BIC}. Larger is better for GA maximizers.
#' Returns \code{Inf} for rank-deficient designs.
#'
#' @seealso \code{\link[stats]{glm.fit}}, \code{\link[stats]{glm.control}},
#' \code{\link[stats]{.lm.fit}}
#'
#' @export
#' @importFrom stats glm.fit glm.control .lm.fit
subsetBIC <- function(subset_bin,
y,
X,
family = stats::gaussian(),
weights = NULL,
offset = NULL,
control = stats::glm.control()) {
n <- as.integer(length(y))
X <- as.matrix(X)
idX <- which(as.integer(abs(subset_bin) != 0) == 1L)
m <- length(idX)
## Construct the Design Matrix
if (m == 0L) {
Xmat <- matrix(1,
nrow = n, ncol = 1L,
dimnames = list(NULL, "(Intercept)")
)
} else {
Xsel <- X[, idX, drop = FALSE]
Xmat <- cbind(`(Intercept)` = 1, Xsel)
if (qr(Xmat)$rank < ncol(Xmat)) BIC_val <- Inf
}
k_eff <- NCOL(Xmat) # include intercept
if (identical(family$family, "gaussian") && identical(family$link, "identity")) {
## Gaussian-identity: use RSS
fit <- stats::.lm.fit(Xmat, y)
if (!is.null(fit$rank) && fit$rank < k_eff) BIC_val <- Inf
rss_like <- sum(fit$residuals^2)
} else {
## GLM: use deviance
fit <- stats::glm.fit(
x = Xmat, y = y,
family = family,
weights = weights,
offset = offset,
control = control
)
if (!is.null(fit$rank) && fit$rank < k_eff) BIC_val <- Inf
rss_like <- fit$deviance
if (!is.finite(rss_like) || rss_like <= 0) {
# avoid -Inf BIC
rss_like <- .Machine$double.eps
}
}
BIC_val <- n * log(rss_like / n) + k_eff * log(n)
return(-BIC_val) # GA::ga or GA::gaisl maximizing
}
#' False Discovery Rate (FDR) and True Positive Rate (TPR) from index labels
#'
#' @description
#' Computes the False Discovery Rate (FDR) and True Positive Rate (TPR, a.k.a. recall)
#' by comparing a set of \emph{true} labels to a set of \emph{predicted} labels.
#' Labels are treated as positive integer indices in \eqn{\{1, \dots, N\}}. Duplicates
#' are ignored (unique indices are used).
#'
#' @details
#' Let \code{truelabel} and \code{predlabel} be sets of indices. The function
#' derives the confusion-matrix counts:
#' \itemize{
#' \item \code{tp} = \eqn{|truelabel \cap predlabel|}
#' \item \code{fp} = \eqn{|predlabel \setminus truelabel|}
#' \item \code{fn} = \eqn{|truelabel \setminus predlabel|}
#' \item \code{tn} = \eqn{N - tp - fp - fn}
#' }
#' and returns
#' \deqn{FDR = fp / (fp + tp), \qquad TPR = tp / (tp + fn).}
#'
#' Inputs are coerced to integer and uniqued. A warning is emitted if any label
#' is < 1, and an error is thrown if any label exceeds \code{N}. If \code{tn < 0},
#' a warning is issued indicating that \code{N} may not reflect the full universe.
#'
#' @param truelabel Integer vector of ground-truth positive indices (values in \code{1..N}).
#' @param predlabel Integer vector of predicted positive indices (values in \code{1..N}).
#' @param N Integer scalar; size of the full index universe (total number of candidates).
#'
#' @return A named list with components:
#' \itemize{
#' \item \code{fdr} False Discovery Rate, \eqn{fp/(fp+tp)} (NaN if \code{fp+tp == 0}).
#' \item \code{tpr} True Positive Rate (recall), \eqn{tp/(tp+fn)} (NaN if \code{tp+fn == 0}).
#' \item \code{fp}, \code{fn}, \code{tp}, \code{tn} Confusion-matrix counts.
#' }
#'
#' @section Edge cases:
#' If \code{predlabel} is empty, \code{fdr} is \code{NaN} and \code{tpr} is 0 (unless
#' \code{truelabel} is also empty, in which case both \code{fdr} and \code{tpr} are \code{NaN}).
#' If \code{truelabel} is empty and \code{predlabel} non-empty, \code{tpr} is \code{NaN}
#' and \code{fdr} is 1.
#'
#' @examples
#' # Simple example
#' N <- 10
#' true <- c(2, 4, 7)
#' pred <- c(4, 5, 7, 7) # duplicates are ignored
#' FDRCalc(true, pred, N)
#'
#' # Empty predictions
#' FDRCalc(true, integer(0), N)
#'
#' # All correct predictions
#' FDRCalc(true, true, N)
#'
#' @export
FDRCalc <- function(truelabel, predlabel, N) {
truelabel <- unique(as.integer(truelabel))
predlabel <- unique(as.integer(predlabel))
tp <- length(intersect(truelabel, predlabel))
fp <- length(setdiff(predlabel, truelabel))
fn <- length(setdiff(truelabel, predlabel))
if (any(truelabel < 1L) | any(predlabel < 1L)) {
warning("Labels are expected to be positive indices (>= 1).")
}
if (any(truelabel > N) | any(predlabel > N)) {
stop("Found labels greater than N. Increase N or correct labels.")
}
tn <- N - tp - fp - fn
if (tn < 0) warning("Computed TN < 0; check that N reflects the full universe.")
fdr <- fp / (fp + tp)
tpr <- tp / (tp + fn)
return(list(fdr = fdr, tpr = tpr, fp = fp, fn = fn, tp = tp, tn = tn))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.