#' Scalable Automatic Feature Engineering
#'
#' Automatically generate new features based on existing ones for further modelling,
#' using the SAFE algorithm proposed in a paper by Shi, Zhang, Li, Yang and Zhou.
#' This is a direct implementation of the pseudo-algorithm proposed in the paper, with its conventions, notation and flaws.
#'
#' @param X_train Matrix - data used to train model. Must be numerical.
#' @param y_train Factor - labels for training data. Must be \strong{binary}.
#' @param X_valid Matrix - data used to test model. Must be numerical.
#' @param y_valid Factor - labels for testing data. Must be \strong{binary}.
#' @param operators A \code{list} of lists of functions. The i-th list contains functions accepting \code{i} vectors of equal length and returning 1 vector of the same length.
#' @param n_iter Integer; number of iterations for the algorithm to perform.
#' @param gamma Integer; number of most important feature combinations to select in each iteration.
#' @param nrounds Integer passed to \code{\link[xgboost]{xgb.train}}.
#' @param alpha Threshold for \code{\link{IV}}. Features with IV < alpha will be dropped.
#' @param bins Integer; number of bins used to discretize features.
#' @param theta Threshold for Pearson's correlation. Features with correlation above theta will be dropped.
#' @param beta Integer; maximum number of features to be selected at the end of each loop. Set to \code{Inf} to select all features.
#'
#' @return
#' A \code{list} with 2 elements: \code{X_train} and \code{X_valid}.
#' They contain the transformed train and validation sets, ready for further modelling.
#' Unfortunately, this is contrary to the algorithm described in the paper (which returns a function) - at least for now.
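#' @examples
#' \dontrun{
#' # A minimal usage sketch with simulated data (illustrative only; names and
#' # values are hypothetical, not taken from the paper):
#' set.seed(1)
#' X <- matrix(rnorm(200 * 4), ncol = 4, dimnames = list(NULL, paste0("V", 1:4)))
#' y <- factor(rbinom(200, 1, stats::plogis(X[, 1] * X[, 2])))
#' idx <- 1:150
#' res <- SAFE(X[idx, ], y[idx], X[-idx, ], y[-idx], n_iter = 2)
#' str(res$X_train)
#' }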
#' @export
SAFE <- function(X_train, y_train, X_valid, y_valid,
                 operators = list(NULL, list(`+`, `-`, `*`)),
                 n_iter = 10,
                 nrounds = 5,
                 alpha = 0.1,
                 gamma = 10,
                 bins = 30,
                 theta = 0.8,
                 beta = Inf){
  # 0. Check parameters
  check_dataset(X_train, y_train)
  check_dataset(X_valid, y_valid)
  check_train_valid(X_train, X_valid)
  check_operators(operators)
  check_n_iter(n_iter)
  check_nrounds(nrounds)
  check_alpha(alpha)
  check_gamma(gamma)
  check_bins(bins)
  check_theta(theta)
  check_beta(beta)
  # Though not mentioned explicitly in the paper, the algorithm is trained on both train and validation data
  X <- rbind(X_train, X_valid)
  y <- forcats::fct_c(y_train, y_valid)
  for(i in 1:n_iter){
    # 1. Train xgboost
    bst <- xgboost::xgboost(data = X, label = y, nrounds = nrounds, verbose = 0)
    # 2. Constitute feature combinations
    feat_combos <- constitute_feat_combos(bst)
    # 3. Sort and filter feature combinations
    selected_feat_combos <- sort_filter_combos(
      X, y, feat_combos,
      seq_lengths = which(sapply(operators, function(op) !is.null(op))),
      gamma = gamma
    )
    # 4. Generate new features
    new_feats <- lapply(selected_feat_combos, function(feat_combo){
      as.data.frame(lapply(operators[[length(feat_combo)]], function(op){
        do.call(op, as.data.frame(X[,feat_combo]))
      }))
    })
    new_feats <- data.matrix(do.call(cbind, new_feats))
    new_X <- cbind(X, new_feats)
    colnames(new_X) <- paste0("SAFE", 1:(ncol(new_feats) + ncol(X)))
    # 5. Remove features with low predictive power
    informative_X <- remove_uninformative_features(new_X, y, alpha = alpha, bins = bins)
    nonredundant_X <- remove_redundant_features(informative_X, y, bins = bins, theta = theta)
    # 6. Final feature sorting and selection
    final_bst <- xgboost::xgboost(data = nonredundant_X, label = y, nrounds = nrounds, verbose = 0)
    feat_gains <- calculate_avg_gain(final_bst, colnames(nonredundant_X))
    sorted_gains <- feat_gains[order(feat_gains$gain, decreasing = TRUE)[1:min(nrow(feat_gains), beta)],]
    X <- nonredundant_X[,sorted_gains[["Feature"]]]
    cat(sprintf("\nIteration %d out of %d complete", i, n_iter))
  }
cat("\n")
list(X_train = X[1:nrow(X_train),],
X_valid = X[(nrow(X_train) + 1):nrow(X),])
}
#' Constitute Feature Combinations
#' @param bst An object of \code{xgb.Booster} class
#' @import data.table
#' @return
#' A list of data.frames with 2 columns: `Feature` (name of the variable) and `Split` (value at which the split in the tree occurred).
#' A single feature might contain multiple split values.
#' @noRd
constitute_feat_combos <- function(bst){
  nrounds <- bst$niter
  # Only the last tree of the ensemble is inspected (trees are 0-indexed)
  model_dt <- xgboost::xgb.model.dt.tree(model = bst, trees = nrounds - 1, use_int_id = TRUE)
  temp_env <- rlang::new_environment()
  assign("result", list(), envir = temp_env)
  # Search the tree recursively to find all root-to-leaf paths
  rec_search <- function(id, history){
    # history - data.frame of (Feature, Split) pairs encountered on the path so far
    # id - id of the current node
    current_feature <- model_dt[Node == id, Feature]
    if(current_feature == 'Leaf'){
      assign("result", append(get("result", envir = temp_env), list(history)), envir = temp_env)
      return()
    }
    history <- rbind(history, data.frame(Feature = current_feature,
                                         Split = model_dt[Node == id, Split],
                                         stringsAsFactors = FALSE),
                     stringsAsFactors = FALSE)
    rec_search(model_dt[Node == id, Yes], history)
    rec_search(model_dt[Node == id, No], history)
  }
  # Initiate the search at the root node
  rec_search(0, data.frame(Feature = character(0),
                           Split = numeric(0),
                           stringsAsFactors = FALSE))
  # Return the result
  get("result", envir = temp_env)
}
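# Illustration (hypothetical tree, for orientation only): a path that splits on
# "V1" at 0.5 and then on "V2" at 1.2 yields
#   data.frame(Feature = c("V1", "V2"), Split = c(0.5, 1.2))
# and the returned list holds one such data.frame per root-to-leaf path.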
#' Sort & filter feature combinations
#' @param X Matrix
#' @param y Vector of labels
#' @param feat_combos A list a list of data.frames with 2 columns: `Feature` and `Split`. See `constitue_feat_combos`.
#' @param seq_lengths Vector of integers with info combinations of how many features to consider.
#' For instance, when user supplied only binary operators, only combinations of 2 features will be calculated.
#' @return A `list` of character vectors (feature combinations). Each vector contains feature names.
#' @noRd
sort_filter_combos <- function(X, y, feat_combos, seq_lengths, gamma){
  # job - a set of features and values to split by
  # To optimize (as suggested in the paper), the possible combinations are determined and redundant ones are filtered
  necessary_jobs <- determine_jobs(feat_combos, seq_lengths)
  # Split data
  labels_splitted <- lapply(necessary_jobs, function(jobs_q){
    lapply(jobs_q, function(job) execute_job(X, y, job))
  })
  # Drop the highest level of nesting
  labels_splitted <- unlist(labels_splitted, recursive = FALSE)
  # Calculate information gain
  scores <- sapply(labels_splitted, information_gain)
  # Return the most informative combinations
  unlisted_jobs <- unlist(necessary_jobs, recursive = FALSE)
  unlisted_job_names <- lapply(unlisted_jobs, function(job) unique(job$Feature))[order(scores, decreasing = TRUE)]
  out <- unique(unlisted_job_names)
  out[1:min(gamma, length(out))]
}
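# Sketch of the flow (hypothetical output): each job induces a partition of y,
# information_gain() scores each partition, and the gamma feature-name sets
# whose splits were most informative are returned,
# e.g. list(c("V1", "V2"), c("V3", "V4")).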
#' Determine necessary combinations to look at
#'
#' @param feat_combos A list of data.frames with 2 columns: `Feature` and `Split`. See `constitute_feat_combos`.
#' @param seq_lengths Vector of integers indicating combinations of how many features to consider.
#' @return A nested list of length == length(seq_lengths) at the top level. Each element is a list of data.frames; each data.frame is a job to execute.
#' @importFrom utils combn
#' @noRd
determine_jobs <- function(feat_combos, seq_lengths){
  lapply(seq_lengths, function(q){
    l <- lapply(feat_combos, function(combo){
      feats <- unique(combo$Feature)
      if(length(feats) < q) return(NULL)
      all_combos <- combn(feats, q)
      # Each column of all_combos is one q-element feature combination
      apply(all_combos, MARGIN = 2, function(col){
        merge(data.frame(Feature = col,
                         stringsAsFactors = FALSE), combo)
      })
    })
    unique(unlist(l, recursive = FALSE))
  })
}
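# Sketch of the intended behaviour (hypothetical values): for a single path over
# features c("V1", "V2", "V3") and seq_lengths = 2, combn() enumerates the pairs
# ("V1","V2"), ("V1","V3"), ("V2","V3"), and merge() attaches every recorded
# split value of the involved features to each pair, producing one job each.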
#' Split data
#'
#' Split data according to info in `job`
#' @param X Matrix
#' @param y Vector of labels
#' @param job A data.frame with 2 columns: Feature and Split. Feature must contain names from colnames(X)
#' @return A list of labels split according to `X` and `job`
#' @noRd
execute_job <- function(X, y, job){
  jobs_splitted <- split(job, factor(job$Feature))
  facs <- lapply(jobs_splitted, function(single_job){
    vec <- X[,single_job$Feature[1]]
    cut(vec, c(min(vec), single_job$Split, max(vec) + 1e-1), right = FALSE)
  })
  combined_facs <- do.call(forcats::fct_cross, facs)
  split(y, combined_facs)
}
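# Illustration (assumed inputs): with job = data.frame(Feature = "V1", Split = 0),
# labels are split into two groups, V1 in [min, 0) and V1 in [0, max + 0.1),
# because cut(..., right = FALSE) produces left-closed intervals.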
#' Remove features with low predictive power
#'
#' @param X Matrix
#' @param y Factor of labels
#' @param alpha Threshold > 0. Features with IV < alpha will be discarded.
#' @param bins Integer
#' @noRd
remove_uninformative_features <- function(X, y, alpha, bins){
  IVs <- calculate_IVs(X, y, bins)
  X[,IVs >= alpha]
}
#' Calculate information values
#'
#' @param X Matrix
#' @param y Factor of labels
#' @param bins Integer
#' @noRd
calculate_IVs <- function(X, y, bins){
  apply(X, 2, function(feat){
    breaks <- sort(feat)[round(1:(bins - 1) / bins * length(feat))]
    breaks <- unique(c(min(feat) - 1e-2, breaks, max(feat)))
    if(length(breaks) >= 3){
      bin_assignments <- forcats::fct_drop(cut(feat, breaks))
      y_splitted <- split(y, bin_assignments)
      IV(y_splitted)
    } else {
      # If length(breaks) == 2, no bins were created - the variable is constant and carries no information
      0
    }
  })
}
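# Worked illustration (hypothetical numbers): for a feature of length 100 with
# bins = 30, the candidate breaks are the sorted values at positions
# round(1:29 / 30 * 100), i.e. quantile-style cut points roughly every 3rd-4th
# observation; unique() then collapses duplicated breaks, so a (near-)constant
# feature ends up with fewer than 3 breaks and is assigned IV = 0.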
#' Remove highly correlated features
#'
#' @param X Matrix
#' @param y Factor of labels
#' @param bins Integer; passed to `calculate_IVs`
#' @param theta Threshold for Pearson's correlation
#' @importFrom stats cor
#' @noRd
remove_redundant_features <- function(X, y, bins, theta){
  cors <- cor(X)
  IVs <- calculate_IVs(X, y, bins)
  # Index pairs (including the diagonal) where |correlation| exceeds the threshold
  indices <- which.matrix(abs(cors) >= theta, ncol(X))
  # For each pair, keep the member with the higher IV
  final_indices <- sapply(indices, function(i){
    i[which.max(IVs[i])]
  })
  X[,unique(final_indices)]
}
#' Which indices in matrix are TRUE?
#'
#' @param logvec Logical vector, created by `mat` `operator` `value`, e.g. M > 0
#' @param ncol Number of columns in the (square) matrix
#'
#' @return A list of pairs (row_index, col_index) of elements in the matrix which are TRUE.
#' Each pair is reported once, with row_index <= col_index; pairs below the diagonal are omitted.
#' @noRd
which.matrix <- function(logvec, ncol){
  wh <- which(logvec)
  l <- lapply(wh, function(w){
    # Recover (row, col) from the column-major linear index; valid for square matrices
    modulos <- w %% ncol
    out <- c(replace(modulos, modulos == 0, ncol), ceiling(w / ncol))
    if(out[1] > out[2]) NULL
    else out
  })
  l[sapply(l, function(el) !is.null(el))]
}
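# Usage sketch: for M <- matrix(c(1, 0.9, 0.9, 1), 2), which.matrix(abs(M) >= 0.8, 2)
# returns list(c(1, 1), c(1, 2), c(2, 2)) - the upper-triangle index pairs.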
#' Calculate average gain per feature
#'
#' @param bst An object of \code{xgb.Booster} class
#' @param all_feats Character vector of all feature names; features absent from the model get gain 0
#' @noRd
calculate_avg_gain <- function(bst, all_feats){
  dt <- xgboost::xgb.model.dt.tree(model = bst)
  gains <- dt[Feature != "Leaf", .(gain = mean(Quality)), Feature]
  all_gains <- merge(data.frame(Feature = all_feats,
                                stringsAsFactors = FALSE), gains, all.x = TRUE)
  all_gains[is.na(all_gains[["gain"]]), "gain"] <- 0
  all_gains
}