################################################################
# _____ _ ____ _ #
# |_ _| | | / __ \ (_) #
# | | _ __ | |_ ___ __ _ _ __| | | |_ __ ___ _ ___ ___ #
# | | | '_ \| __/ _ \/ _` | '__| | | | '_ ` _ \| |/ __/ __| #
# | |_| | | | || __| (_| | | | |__| | | | | | | | (__\__ \ #
# |_____|_| |_|\__\___|\__, |_| \____/|_| |_| |_|_|\___|___/ #
# __/ | #
# |___/ #
################################################################
################################################################
# @script: terga1.R
# @author: Edi Prifti
# @author: Lucas Robin
# @date: August 2016
################################################################
#' terga1: Model search algorithm based on genetic algorithms (GA)
#'
#' @title terga1
#' @description terga1 is a model search algorithm based on genetic algorithms (GA). A “genome” or “individual” in this context is a combination of features that will be associated together to compute a score that will be the prediction model. Depending on the type of fitting function that is maximized the fatures are weighed by specific coefficients. In short the algorithm is based on different operations such as crossing, mutating and evolving different “individuals” and evaluating their fitness to the “environment” which is represented by the variable to be predicted.
#' @param sparsity: number of features in a given model. This is a vector with multiple lengths.
#' @param size_pop: the number of individuals in a population to be evolved.
#' @param size_world: this is the number of features in the dataset.
#' @param max.nb.features: focuses only on the subset of top most significant features (default:1000)
#' @param popSourceFile: A population of models that can start as a first generation to be evolved (default:NULL).
#' @param popSaveFile: (??)
#' @param language is the language that is used by the different algorithms {bin, bininter, ter, terinter, ratio}, (default:"terinter")
#' @param scoreFormula: a Function that contains the ratio Formula or other specific ones
#' @param epsilon: a small value to be used with the ratio language (default: NULL). When null it is going to be calculated by the minimum value of X divided by 10.
#' @param unique_vars: logical (default: FALSE) indicates weather unique variables can be used in a model or population.
#' @param objective: this can be auc, cor or aic. Terga can also predict regression, other than class prediction. (default:auc)
#### TODO UPDATE THIS AS c(classification, regression)
#' @param estimate_coefs: non ternary solution for the aic objective (default:FALSE)
#' @param intercept: (Interceot for the a given model) (default:NULL)
#' @param evalToFit: The model performance attribute to use as fitting score (default:"fit_"). Other choices are c("auc_","accuracy_","precision_","recall_","f_score_")
#' @param k_penalty: Penalization of the fit by the k_sparsity (default: 0)
#' @param select_type: the selection operator type. can be mixed, elite or tournoi (default: mixed)
#' @param select_perc1: percentage of individuals to be selected with elite
#' @param select_perc2: percentage of individuals to be selected with tournoi
#' @param perc_best_ancestor: percentage of best ancentors as seeding in the new population
#' @param mutate_size: percentage of individuals in the population to be mutated
#' @param mutate_rate: percentage of features in an individual to be mutated
#' @param plot: plot graphics indicating the evolution of the simulation (default:FALSE)
#### TOCHECK is this still needed for tergaV2
#' @param convergence: should the algorithm converge when the best individual is not improving (default:TRUE).
#' @param convergence_steps: the number of generations after which we consider convergence (default:10).
#' @param evolve_k1: weather or not to evaluate exhaustively the features for k_sparse=1. This will take a lot of time if the dataset is large, thus the possibility to evolve this using the GA. (default:TRUE)
#' @param verbose: print out information on the progress of the algorithm (default:TRUE)
#' @param warnings: Print out warnings when runnig (default:FALSE).
#' @param debug: print debug information (default:FALSE)
#' @param print_ind_method: One of c("short","graphical") indicates how to print a model and subsequently a population during the run (default:"short").
#' @param parallelize.folds: parallelize folds when cross-validating (default:TRUE)
#' @param nb_generations: maximum number of generations to evolve the population.
#' @param nCores: the number of cores to execute the program. If nCores=1 than the program runs in a non parallel mode
#' @param seed: the seed to be used for reproductibility. If seed=NULL than it is not taken into account (default:NULL).
#### TODO check
#' @param experiment.id: The id of the experiment that is to be used in the plots and comparitive analyses (default is the learner's name, when not specified)
#' @param experiment.description: A longer description of the experiment. This is important when many experiments are run and can also be printed in by the printExperiment function.
#' @param experiment.save: Data from an experiment can be saved with different levels of completness, with options to be selected from c("nothing", "minimal", "full"), default is "minimal"
#' @return an object containing a list of parameters for this classifier
#' @export
terga1 <- function(sparsity = c(1:10),
# population options
size_pop = 100, size_world = "NULL", max.nb.features = 1000, popSourceFile = "NULL", popSaveFile = "NULL",
# language in {bin, bininter, ter, terinter, ratio}
language = "terinter",
# language options
scoreFormula=scoreRatio, epsilon = "NULL",
# individual options
unique_vars = FALSE,
# evaluation options
objective = "auc", k_penalty=0, evalToFit = "fit_", estimate_coefs = FALSE, intercept = "NULL",
# selection options
select_type = "mixed", select_perc1 = 20, select_perc2 = 30, perc_best_ancestor = 10,
# mutation options
mutate_size = 70, mutate_rate = 50,
# evolution options
nb_generations = 100, convergence = TRUE, convergence_steps = 10, evolve_k1 = TRUE,
# output options
plot = FALSE, verbose = TRUE, warnings = FALSE, debug = FALSE, print_ind_method = "short", parallelize.folds = TRUE,
# computing options
nCores = 4, seed = "NULL",
# experiment options
experiment.id = "NULL", experiment.description = "NULL", experiment.save = "nothing")
{
clf <- list() # create a classifier object
clf$learner <- "terga1" # name of the method
clf$params <- list() # parameter list
clf$experiment <- list() # information about the experiment
# POPULATION
clf$params$sparsity <- sparsity # number of non zero variables in the model
clf$params$current_sparsity <- NA # number of non zero variables in the model
clf$params$size_pop <- size_pop # how many models in the population to evolve
clf$params$size_world <- size_world # total number of variables
clf$params$max.nb.features <- max.nb.features
clf$params$nb_generations <- nb_generations # number of generation to evolve
clf$params$unique_vars <- unique_vars # weather in a model we can have one variable more than once
clf$params$popSourceFile <- popSourceFile
clf$params$popSaveFile <- popSaveFile
# FITTING
clf$params$objective <- objective # prediction (roc) or regression (cor)
clf$params$estimate_coefs <- estimate_coefs # integer or real estimated coefficients.
# SELECTION
clf$params$select_type <- select_type # selection method (mixte, elitist, tournoi)
clf$params$select_perc1 <- select_perc1 # selection percentage for elitist
clf$params$select_perc2 <- select_perc2 # selection percentage for tournoi
clf$params$perc_best_ancestor <- perc_best_ancestor # percentage of best k-1 models to be used as seed for the k simulation
# MUTATION
clf$params$mutate_size <- mutate_size # what percentage of models in the population are mutated
clf$params$mutate_rate <- mutate_rate # what percentage of the variables in the model are mutated
# CONVERGENCE
clf$params$convergence <- convergence # what should the simulation stop when convergence ?
clf$params$convergence_steps <- convergence_steps # after how many steps without improvement do we consider convergence?
clf$params$evolve_k1 <- evolve_k1 # weather to evolve models with k_1 or to search them exhaustively.
# print out intermediary results
clf$params$plot <- plot # plot results?
clf$params$verbose <- verbose # print out logs.
clf$params$warnings <- warnings # print out warnings
clf$params$debug <- debug # print out debugging information.
clf$params$print_ind_method <- print_ind_method # method to print individual
# Computing options
clf$params$nCores <- nCores # parallel computing
clf$params$parallel <- nCores > 1 # parallel computing
clf$params$parallelize.folds <- parallelize.folds
clf$params$parallel.local <- FALSE
clf$params$seed <- seed
# Evaluation options
clf$params$evalToFit <- evalToFit
clf$params$k_penalty <- k_penalty
clf$params$language <- language
#clf$params$useCustomLanguage <- useCustomLanguage
clf$params$epsilon <- epsilon
clf$params$scoreFormula <- scoreFormula
clf$params$intercept <- intercept
# Experiment information
if(!(experiment.id=="NULL"))
{
clf$experiment$id <- experiment.id
}else
{
clf$experiment$id <- clf$learner
}
if(!(experiment.description=="NULL"))
{
clf$experiment$description <- experiment.description
} else
{
clf$experiment$description <- paste(clf$learner, date() , sep = " ")
}
#match.arg(experiment.save)
clf$experiment$save <- experiment.save
return(clf)
}
#' Fit a Model Using an Evolutionary Algorithm with Controlled Parameters
#'
#' This function fits a model to the data using an evolutionary algorithm,
#' allowing for controlled sparsity, fitness evaluation, and multiple evolution
#' strategies. The function iteratively evolves a population of models,
#' optimizing for accuracy or correlation, and terminates based on specified
#' conditions.
#'
#' @param X A matrix or data frame of feature values, where each row represents
#' a feature and each column represents a sample.
#' @param y A response vector or target variable for supervised learning. The
#' length of \code{y} should match the number of columns in \code{X}.
#' @param clf A classifier object containing parameters for fitting, including:
#' - `params`: A list of parameters for fitting, such as `language`, `objective`, `evalToFit`, `sparsity`, `size_pop`, `nb_generations`, `maxTime`, and `evolver`.
#' - `feature.cor`: A matrix of feature correlations for pre-selection of features based on correlation.
#' - `functions`: A list of functions to be used within the fitting process, such as `evolve`, `mutate`, and `cross`.
#'
#' @details The function performs the following steps:
#'
#' 1. **Parameter Initialization**: Sets parameters based on the specified
#' `language` and adjusts `evalToFit` based on the `objective`. 2. **Feature
#' Pre-selection**: Optionally restricts `X` to the most significant features
#' based on `clf$feature.cor` if `max.nb.features` is specified. 3. **Population
#' Initialization**: Initializes a population of models based on `in_pop` (if
#' available) or generates a new population with random individuals. 4.
#' **Evolutionary Process**:
#' - Iteratively evolves the population over generations, evaluating and ranking individuals by fitness.
#' - Applies selection, crossover, and mutation strategies to generate new models.
#' - Stores the best models by sparsity level for each generation.
#' 5. **Termination**: Stops when the maximum number of generations or specified
#' execution time (`maxTime`) is reached.
#'
#' The function returns a collection of the best models, sorted by fitness, and
#' allows for plotting evolution if `plot` is set to `TRUE`.
#'
#' @return A list representing a model collection of the best individuals from
#' the final population, each evaluated by fitness and sparsity level.
#'
#' @examples
#' \dontrun{
#' X <- matrix(rnorm(1000), nrow = 10) # Random features
#' y <- sample(c(0, 1), 100, replace = TRUE) # Random binary response
#' clf <- list(
#' params = list(language = "bin", objective = "accuracy", evalToFit = "accuracy_", size_pop = 20,
#' nb_generations = 50, maxTime = 3600, evolver = "v2m"),
#' feature.cor = data.frame(p = runif(10)),
#' functions = list(evolver = evolve2m)
#' )
#' best_models <- terga1_fit(X, y, clf)
#' print(best_models)
#' }
#'
#' @export
terga1_fit <- function(X, y, clf) {
# Setting the language environment
switch(clf$params$language,
ter=
{
# ternary language without intercept (maximize the auc)
if(clf$params$verbose){print("Setting environment for the language 'ter'")}
if(clf$params$objective == "cor")
{
clf$params$evalToFit <- "cor_"
}else
{
# note that here with the ter language we could not use auc to fit since the intercept should be 0
clf$params$intercept = 0
if(clf$params$evalToFit == "auc_")
{
clf$params$evalToFit <- "accuracy_"
warning("terga1_fit: changing evalToFit from auc_ to accuracy_ because of the language.")
}
}
},
terinter=
{
# ternary language without intercept (maximize the accuracy)
if(clf$params$verbose){print("Setting environment for the language 'terinter'")}
if(clf$params$objective == "cor")
{
clf$params$evalToFit <- "cor_"
}
},
bin=
{
# ternary language without intercept (maximize the auc)
if(clf$params$verbose){print("Setting environment for the language 'bin'")}
if(clf$params$objective == "cor")
{
clf$params$evalToFit <- "cor_"
}else
{
# note that here with the ter language we could not use auc to fit since the intercept should be 0
clf$params$intercept = 0
if(clf$params$evalToFit == "auc_")
{
clf$params$evalToFit <- "accuracy_"
warning("terga1_fit: changing evalToFit from auc_ to accuracy_ because of the language.")
}
}
},
bininter=
{
# ternary language without intercept (maximize the auc)
if(clf$params$verbose){print("Setting environment for the language 'bininter'")}
if(clf$params$objective == "cor")
{
clf$params$evalToFit <- "cor_"
}
},
ratio=
{
# ternary language without intercept (maximize the auc)
if(clf$params$verbose){print("Setting environment for the language 'ratio'")}
if(clf$params$objective == "cor")
{
clf$params$evalToFit <- "cor_"
}
},
{
stop(paste("The language",clf$params$language, "is not implemented !"))
}
)
# if(clf$params$max.nb.features < nrow(X))
# {
# if(clf$params$verbose) print(paste("... ... restricting X to the",clf$params$max.nb.features,"most significant variables"))
# selected.features <- rownames(clf$feature.cor[order(clf$feature.cor$p),][1:min(clf$params$max.nb.features, nrow(X)),])
# X <- X[selected.features,]
# clf$coeffs_ <- clf$coeffs_[selected.features]
# clf$feature.cor <- clf$feature.cor[selected.features,]
# }
# set the size of the world
clf$params$size_world <- nrow(X)
# Print the experiment configuration
if(clf$params$verbose) printClassifier(obj = clf)
res <- list() # the final object containing the evolved models
for(i in clf$params$sparsity) # sparsity is = k, i.e. the number of features in a model
{
cat("\t\tResolving problem with\t", i, "\tvariables ...\n")
# set the current_sparsity
clf$params$current_sparsity <- i
# test for
if (clf$params$current_sparsity == 1 & !clf$params$evolve_k1) # if we want to evolve features for k_sparse=1 we create a normal population
{
pop_last <- as.list(1:nrow(X)) # create population with k_sparse = 1
}else
{
best_ancestor = NULL
if (exists("pop_last"))
{
if(length(evaluation) != 0)
{
best_ancestor = pop_last[[which.max(evaluation)]]
}
# build a new population seeded by the last one
pop <- population(clf = clf,
size_ind = i,
size_world = nrow(X),
best_ancestor = best_ancestor,
size_pop = clf$params$size_pop,
seed = clf$params$current_seed)
}else
{
# build a new population from scratch
pop <- population(clf = clf,
size_ind = i,
size_world = nrow(X),
best_ancestor = best_ancestor,
size_pop = clf$params$size_pop,
seed = clf$params$current_seed)
}
# if this is population with model objects we transform in an index population
if(isPopulation(obj = pop))
{
pop <- listOfModelsToListOfSparseVec(list.models = pop)
}
# then we evolve
pop_last <- evolve(X, y, clf, pop, seed = clf$params$current_seed)
}
# evaluate the fitting function for all the models of the populaion
# transform to a population of model objects
pop_last.mod <- listOfSparseVecToListOfModels(X = X, y = y, clf = clf, v = pop_last)
# evaluate the population
pop.last.eval <- evaluatePopulation(X, y, clf, pop_last.mod, force.re.evaluation = TRUE, eval.all = TRUE)
# get the evaluation vector
evaluation <- as.numeric(populationGet_X(element2get = "fit_", toVec = TRUE, na.rm = TRUE)(pop = pop.last.eval))
# get the evaluation
evaluation.ord <- order(abs(evaluation), decreasing = TRUE)
# order by best in front
evaluation <- evaluation[evaluation.ord]
pop_ordered_mod <- pop.last.eval[evaluation.ord] # we gain speed
# get the best individuals (should be the first)
best_individual_index <- which.max(abs(evaluation))
best_individual <- pop_ordered_mod[[best_individual_index]]
# print it out
if(clf$params$verbose)
{
if(isModel(best_individual))
{
try(cat(paste("gen =",i,"\t", printModel(mod = best_individual, method = clf$params$print_ind_method, score = "fit_"),"\n")), silent = TRUE)
}
}
# transform the indexes into models
if(!isPopulation(obj = pop_ordered_mod))
{
pop_ordered_mod <- evaluatePopulation(X, y, clf, pop_ordered_mod, force.re.evaluation = TRUE, eval.all = TRUE)
}
# keep only models that are unique
pop_ordered_mod <- unique(pop_ordered_mod)
if(!(clf$params$popSaveFile=="NULL"))
{
#pop2Save <- evaluatePopulation(X, y, clf, pop_ordered_mod, eval.all = TRUE)
pop2Save <- pop_ordered_mod
savePopulation(pop2Save, paste("generation", i, clf$params$popSaveFile, sep = "_"))
}
# save the whole list of models ordered by fitting score. The best is the first
res[[paste("k",i,sep = "_")]] <- pop_ordered_mod
}
return(res)
}
## End of the function terga_fit
#######################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.